diff --git a/README.md b/README.md index 1317e03..a3a6c43 100644 --- a/README.md +++ b/README.md @@ -68,8 +68,7 @@ Before you go ahead make sure you have [Python 3.10](https://www.python.org/down ### How to install and setup an Elasticsearch server locally -Before you go ahead make sure you have [docker](https://docker.com/) installed and available in your system. The below steps are required only for the initial setup. To re-run an existing docker container, run `docker start my_es_server` and enter the previously saved password if prompted. - +Before you go ahead make sure you have [docker](https://docker.com/) installed and available in your system. The steps below are required only for the initial setup. To re-run an existing docker container, run `docker start my_es_server` and enter the previously saved password if prompted. 1. Create the elasticsearch server: ```bash @@ -110,11 +109,11 @@ Before you go ahead make sure you have [docker](https://docker.com/) installed a ```bash printenv ``` -- To query Database using Django models +- To query the database using Django models ```bash python3 manage.py dbshell ``` -- Make Django Migrations +- Make Django migrations ```bash python3 manage.py makemigrations ``` diff --git a/scripts/get_ensembl_metadata.py b/scripts/get_ensembl_metadata.py index 096ffb8..c65769e 100644 --- a/scripts/get_ensembl_metadata.py +++ b/scripts/get_ensembl_metadata.py @@ -10,16 +10,16 @@ def get_taxon_metadata(db_connection): """ The function connects with ensembl My sql database and runs a sql query to retrieve data needed to load - into django models as fixtures. + into django models as fixtures. Parameters: db_connection (sqlalchemy.create_engine): A sqlalchemy engine. Returns: - pandas dataframe (pd.DataFrame): tabluar data. + pandas dataframe (pd.DataFrame): tabular data. """ - + query = f"""select distinct taxonomy_id ,o.name ,url_name ,display_name ,scientific_name ,strain from organism o diff --git a/scripts/get_metazoa_species.py b/scripts/get_metazoa_species.py index 117275b..f83fea8 100644 --- a/scripts/get_metazoa_species.py +++ b/scripts/get_metazoa_species.py @@ -9,7 +9,6 @@ def get_taxon_ids(url): - """ The function scrapes the metazoa taxonomy ids data from a fixed url: https://metazoa.ensembl.org/species.html @@ -19,7 +18,7 @@ def get_taxon_ids(url): that belong to metazoa. """ - + response = requests.get(url) soup = BeautifulSoup(response.text, "lxml") @@ -33,9 +32,9 @@ def get_taxon_ids(url): def get_taxon_tree(taxon_ids, db_engine): """ - The function connects with ensembl My sql database + The function connects with ensembl MySQL database and runs a sql query to retrieve taxon tree data needed to load - into django models as fixtures. + into django models as fixtures. Parameters: taxon_ids (List): list of taxonomy ids for which entire tree structures @@ -43,7 +42,7 @@ def get_taxon_tree(taxon_ids, db_engine): db_conn (sqlalchemy.create_engine): A sqlalchemy engine. Returns: - pandas dataframe (pd.DataFrame): tabluar data. + pandas dataframe (pd.DataFrame): tabular data. """ diff --git a/scripts/get_phrases.py b/scripts/get_phrases.py index a6ccb51..5885136 100644 --- a/scripts/get_phrases.py +++ b/scripts/get_phrases.py @@ -20,7 +20,7 @@ def get_taxon_ids(url): that belong to metazoa. """ - + response = requests.get(url) soup = BeautifulSoup(response.text, "lxml") @@ -34,7 +34,7 @@ def get_taxon_ids(url): def preprocess_name(text): """ - Does some basic text preprocessing like + Does some basic text preprocessing like removing special characters, extra spaces, etc. """ name = text["name"] @@ -53,9 +53,9 @@ def preprocess_name(text): def get_taxon_names(taxon_ids, db_conn): """ - The function connects with ensembl My sql database - and runs a sql query to retrieve taxon names like synonyms, common names, - scientif names and converts into elastic search synonym file format. + The function connects with ensembl MySQL database + and runs a sql query to retrieve taxon names like synonyms, common names, + scientific names and converts into elastic search synonym file format. Parameters: taxon_ids (List): list of taxonomy ids for which entire tree structures @@ -63,7 +63,7 @@ def get_taxon_names(taxon_ids, db_conn): db_conn (sqlalchemy.create_engine): A sqlalchemy engine. Returns: - pandas dataframe (pd.DataFrame): tabluar data. + pandas dataframe (pd.DataFrame): tabular data. """ diff --git a/scripts/get_synonyms.py b/scripts/get_synonyms.py index b60aff6..ce1a047 100644 --- a/scripts/get_synonyms.py +++ b/scripts/get_synonyms.py @@ -20,7 +20,7 @@ def get_taxon_ids(url): that belong to metazoa. """ - + response = requests.get(url) soup = BeautifulSoup(response.text, "lxml") @@ -34,10 +34,10 @@ def get_taxon_ids(url): def preprocess_name(text): """ - Does some basic text preprocessing like + Does some basic text preprocessing like removing special characters, extra spaces, etc. """ - + name = text["name"] if text["name_class"] != "scientific name": name = re.sub(r"[,.;@#?!&$\(\)]+\ *", " ", name) @@ -49,9 +49,9 @@ def preprocess_name(text): def get_taxon_names(taxon_ids, db_conn): """ - The function connects with ensembl My sql database - and runs a sql query to retrieve taxon names like synonyms, common names, - scientif names and converts into elastic search synonym file format. + The function connects with ensembl MySQL database + and runs a sql query to retrieve taxon names like synonyms, common names, + scientific names and converts into elastic search synonym file format. Parameters: taxon_ids (List): list of taxonomy ids for which entire tree structures @@ -59,10 +59,10 @@ def get_taxon_names(taxon_ids, db_conn): db_conn (sqlalchemy.create_engine): A sqlalchemy engine. Returns: - pandas dataframe (pd.DataFrame): tabluar data. + pandas dataframe (pd.DataFrame): tabular data. """ - + query_df = pd.DataFrame() for i in range(len(taxon_ids[:])): taxon_id = taxon_ids[i] diff --git a/scripts/get_taxon.py b/scripts/get_taxon.py index 32e9e6f..0c50d24 100644 --- a/scripts/get_taxon.py +++ b/scripts/get_taxon.py @@ -18,7 +18,7 @@ def get_taxon_ids(url): that belong to metazoa. """ - + response = requests.get(url) soup = BeautifulSoup(response.text, "lxml") @@ -32,9 +32,9 @@ def get_taxon_ids(url): def get_taxon_tree(taxon_ids, db_engine): """ - The function connects with ensembl My sql database + The function connects with ensembl MySQL database and runs a sql query to retrieve taxon tree data needed to load - into django models as fixtures. + into django models as fixtures. Parameters: taxon_ids (List): list of taxonomy ids for which entire tree structures @@ -42,11 +42,10 @@ def get_taxon_tree(taxon_ids, db_engine): db_engine (sqlalchemy.create_engine): A sqlalchemy engine. Returns: - pandas dataframe (pd.DataFrame): tabluar data. + pandas dataframe (pd.DataFrame): tabular data. """ - tree_df = pd.DataFrame() for i in range(len(taxon_ids[:])): taxon_id = taxon_ids[i] @@ -76,7 +75,14 @@ def get_taxon_tree(taxon_ids, db_engine): # get data json for taxon_search.NCBITaxaNode model pk_col = ["taxon_id"] - field_col = ["parent_id", "rank", "genbank_hidden_flag", "left_index", "right_index", "root_id"] + field_col = [ + "parent_id", + "rank", + "genbank_hidden_flag", + "left_index", + "right_index", + "root_id", + ] m1_df = metazoa_df[pk_col + field_col].drop_duplicates() m1_df["model"] = "taxon_search.NCBITaxaNode" diff --git a/scripts/get_taxon_flat.py b/scripts/get_taxon_flat.py index 7df6ea4..fb4e975 100644 --- a/scripts/get_taxon_flat.py +++ b/scripts/get_taxon_flat.py @@ -33,9 +33,9 @@ def get_taxon_ids(url): def get_taxon_tree_flat(taxon_ids, db_conn): """ - The function connects with ensembl My sql database + The function connects with ensembl MySQL database and runs a sql query to retrieve taxon tree data needed to load - into django models as fixtures. + into django models as fixtures. Parameters: taxon_ids (List): list of taxonomy ids for which entire tree structures @@ -43,10 +43,10 @@ def get_taxon_tree_flat(taxon_ids, db_conn): db_conn (sqlalchemy.create_engine): A sqlalchemy engine. Returns: - pandas dataframe (pd.DataFrame): tabluar data. + pandas dataframe (pd.DataFrame): tabular data. """ - + unique_taxons = list(set(taxon_ids)) tree_df = pd.DataFrame() @@ -87,7 +87,6 @@ def get_taxon_tree_flat(taxon_ids, db_conn): all_ids = metazoa_ids + add_ids metazoa_df = get_taxon_tree_flat(all_ids, db_conn) - # the below code converts the dataframe into json format # required by django to load as fixtures. pk_col = [] diff --git a/src/index_documents.py b/src/index_documents.py index 3d83073..0f29e5c 100644 --- a/src/index_documents.py +++ b/src/index_documents.py @@ -22,7 +22,11 @@ def __init__(self): def add_arguments(self, parser): parser.add_argument( - "-b", "--batch-size", dest="batch_size", type=int, help="Number of items to index at once." + "-b", + "--batch-size", + dest="batch_size", + type=int, + help="Number of items to index at once.", ) parser.add_argument( "-r", @@ -32,12 +36,26 @@ def add_arguments(self, parser): help="Remove objects from the index that are no longer present in \ the database.", ) - parser.add_argument("-i", "--index", dest="index", type=str, help="Specify which index to update.") parser.add_argument( - "-c", "--clear_index", action="store_true", default=False, help="Clear and rebuild index." + "-i", + "--index", + dest="index", + type=str, + help="Specify which index to update.", ) parser.add_argument( - "-a", "--age", dest="age", default=0, help="Number of hours back to consider objects new." + "-c", + "--clear_index", + action="store_true", + default=False, + help="Clear and rebuild index.", + ) + parser.add_argument( + "-a", + "--age", + dest="age", + default=0, + help="Number of hours back to consider objects new.", ) def handle(self, *args, **options): diff --git a/src/taxon_search/documents.py b/src/taxon_search/documents.py index eb06103..df68390 100644 --- a/src/taxon_search/documents.py +++ b/src/taxon_search/documents.py @@ -27,13 +27,11 @@ synonyms=load_synonym_file(ph_file_path), ) -# Elastic search index time analyzer to be used while indexing documents. +# Elastic search index time analyzer to be used while indexing documents. index_analyzer = analyzer( "index_analyzer", tokenizer="standard", - filter=["lowercase", "stop", - autophrase_syn_filter, - synonym_token_filter], + filter=["lowercase", "stop", autophrase_syn_filter, synonym_token_filter], ) #### Define Ensembl Taxonomy Flat on elastic search with appropiate settings. @@ -45,15 +43,16 @@ @taxon_flat_index.document class TaxonFlatDocument(Document): """ - Elastic Search Document Model for Index data from the + Elastic Search Document Model for Index data from the NCBITaxonFlat Django Model. - Auto Indexing signals are disabled. For re-indexing or updating the + Auto Indexing signals are disabled. For re-indexing or updating the index, run the below command in `src` directory. python3 manage.py search_index --rebuild """ + taxon_id = fields.IntegerField(attr="taxon_id") parent_id = fields.IntegerField(attr="parent_id") left_index = fields.IntegerField(attr="left_index") diff --git a/src/taxon_search/migrations/0001_initial.py b/src/taxon_search/migrations/0001_initial.py index 9614df1..8952c04 100644 --- a/src/taxon_search/migrations/0001_initial.py +++ b/src/taxon_search/migrations/0001_initial.py @@ -22,7 +22,8 @@ class Migration(migrations.Migration): ( "parent_id", models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, to="taxon_search.ncbitaxanode" + on_delete=django.db.models.deletion.CASCADE, + to="taxon_search.ncbitaxanode", ), ), ], @@ -36,7 +37,10 @@ class Migration(migrations.Migration): ( "id", models.BigAutoField( - auto_created=True, primary_key=True, serialize=False, verbose_name="ID" + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", ), ), ("name", models.CharField(db_index=True, max_length=500)), diff --git a/src/taxon_search/migrations/0003_alter_ncbitaxanode_parent_id.py b/src/taxon_search/migrations/0003_alter_ncbitaxanode_parent_id.py index 33eaa30..9fdb446 100644 --- a/src/taxon_search/migrations/0003_alter_ncbitaxanode_parent_id.py +++ b/src/taxon_search/migrations/0003_alter_ncbitaxanode_parent_id.py @@ -6,7 +6,10 @@ class Migration(migrations.Migration): dependencies = [ - ("taxon_search", "0002_rename_taxaname_taxon_id_ncbitaxaname_taxon_id_and_more"), + ( + "taxon_search", + "0002_rename_taxaname_taxon_id_ncbitaxaname_taxon_id_and_more", + ), ] operations = [ @@ -14,7 +17,9 @@ class Migration(migrations.Migration): model_name="ncbitaxanode", name="parent_id", field=models.ForeignKey( - default=0, on_delete=django.db.models.deletion.CASCADE, to="taxon_search.ncbitaxanode" + default=0, + on_delete=django.db.models.deletion.CASCADE, + to="taxon_search.ncbitaxanode", ), ), ] diff --git a/src/taxon_search/migrations/0006_ensemblmetadata.py b/src/taxon_search/migrations/0006_ensemblmetadata.py index 0750489..7ba0f64 100644 --- a/src/taxon_search/migrations/0006_ensemblmetadata.py +++ b/src/taxon_search/migrations/0006_ensemblmetadata.py @@ -15,7 +15,10 @@ class Migration(migrations.Migration): ( "id", models.BigAutoField( - auto_created=True, primary_key=True, serialize=False, verbose_name="ID" + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", ), ), ("taxonomy_id", models.IntegerField()), diff --git a/src/taxon_search/migrations/0008_ncbitaxaflat.py b/src/taxon_search/migrations/0008_ncbitaxaflat.py index 13b9c31..a017a34 100644 --- a/src/taxon_search/migrations/0008_ncbitaxaflat.py +++ b/src/taxon_search/migrations/0008_ncbitaxaflat.py @@ -15,7 +15,10 @@ class Migration(migrations.Migration): ( "id", models.BigAutoField( - auto_created=True, primary_key=True, serialize=False, verbose_name="ID" + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", ), ), ("taxon_id", models.IntegerField()), diff --git a/src/taxon_search/migrations/0010_taxonflat.py b/src/taxon_search/migrations/0010_taxonflat.py index c866296..51bf053 100644 --- a/src/taxon_search/migrations/0010_taxonflat.py +++ b/src/taxon_search/migrations/0010_taxonflat.py @@ -15,7 +15,10 @@ class Migration(migrations.Migration): ( "id", models.BigAutoField( - auto_created=True, primary_key=True, serialize=False, verbose_name="ID" + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", ), ), ("taxon_id", models.IntegerField()), diff --git a/src/taxon_search/migrations/0012_ncbitaxonflat.py b/src/taxon_search/migrations/0012_ncbitaxonflat.py index 2e41e01..12c2e22 100644 --- a/src/taxon_search/migrations/0012_ncbitaxonflat.py +++ b/src/taxon_search/migrations/0012_ncbitaxonflat.py @@ -15,7 +15,10 @@ class Migration(migrations.Migration): ( "id", models.BigAutoField( - auto_created=True, primary_key=True, serialize=False, verbose_name="ID" + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", ), ), ("taxon_id", models.IntegerField()), diff --git a/src/taxon_search/models.py b/src/taxon_search/models.py index c4a9897..bc09d82 100644 --- a/src/taxon_search/models.py +++ b/src/taxon_search/models.py @@ -7,10 +7,10 @@ class EnsemblMetadata(models.Model): """ EnsemblMetadata Django model class. - All fields defined are self explainatory and + All fields defined are self explainatory and picked up from the ensembl MySQL database aftering joining 'organism' , 'genome', 'division' - and 'data_release' tables present + and 'data_release' tables present in the 'ensmebl_metadata_109' schema/database. refer the scripts/get_ensembl_metadata.py for SQL query used. @@ -32,11 +32,11 @@ class NCBITaxonFlat(models.Model): """ NCBI Taxonomy Django model class. - All fields defined are self explainatory and + All fields defined are self explainatory and picked up from the ensembl MySQL database aftering joining 'ncbi_taxa_node' and 'ncbi_taxa_name' tables present in the 'ncbi_taxonomy_109' schema/database. - + refer the scripts/get_taxon_flat.py for SQL query used. """ diff --git a/src/taxon_search/search.py b/src/taxon_search/search.py index 6eb1433..1c41f71 100644 --- a/src/taxon_search/search.py +++ b/src/taxon_search/search.py @@ -1,15 +1,16 @@ from .documents import TaxonFlatDocument + def search_species(query): """ - The function calls the Elastic Search Document Model using - a query string and returns the results of the elastic search. + The function calls the Elastic Search Document Model using + a query string and returns the results of the elasticsearch. Parameters: query (str): Query string provided by the user as Input Returns: - return_type (List[Dict]): List of Dictonaries containing the + return_type (List[Dict]): List of Dictionaries containing the search results from the elastic search server. """ @@ -21,9 +22,6 @@ def search_species(query): query_results = [] for hit in hits: - # print(hit.name, hit.name_class, hit.taxon_id, hit.species_taxon_id, hit.meta.score) - # print(hit.parent_id) - data = { "taxon_id": hit.taxon_id, "name": hit.name, diff --git a/src/taxon_search/utils.py b/src/taxon_search/utils.py index efd0fa5..bdad4bc 100644 --- a/src/taxon_search/utils.py +++ b/src/taxon_search/utils.py @@ -1,20 +1,23 @@ from sqlalchemy import create_engine, text as sql_text -from .models import EnsemblMetadata, NCBITaxonFlat + +from .models import EnsemblMetadata + TREE_TRAVERSAL_LIMIT = 5 + def load_synonym_file(path): """ - Utility function to load + Utility function to load Elastic search synonym files. Parameters: path (str): Path to the file. Returns: - return_type (List[str]): List of string for elastic search server + return_type (List[str]): List of string for elastic search server to retrieve synonyms/phrases. - + """ syn_list = [] with open(path, "r") as f: @@ -24,11 +27,11 @@ def load_synonym_file(path): print(f"{path} file loaded...") return syn_list - + def run_custom_sql(engine, query): """ - Utility function to execute + Utility function to execute custom sql directly on Django models. Parameters: @@ -36,9 +39,9 @@ def run_custom_sql(engine, query): query (str): A SQL in multi-lined string format. Returns: - return_type (List[List]): List of Lists containing the + return_type (List[List]): List of Lists containing the retrieved query results. - + """ with engine.connect() as cursor: rows = cursor.execute(sql_text(query)) @@ -48,19 +51,19 @@ def run_custom_sql(engine, query): def get_relevant_species(species_dict): """ - Given a species dictonary with taxonomy id, the function - retrieves species from its parent id's and matches them + Given a species dictionary with taxonomy id, the function + retrieves species from its parent id's and matches them with ensembl database. Parameters: - species_dict (Dict): A dictonary with species details like + species_dict (Dict): A dictionary with species details like taxonomy id, parent id, name, etc. Returns: - species_dict (List[Dict]): List of Dictonaries containing the + species_dict (List[Dict]): List of Dictionaries containing the matched relevant species. - parent_name (str): Name of parent taxonomy under + parent_name (str): Name of parent taxonomy under which species match is found. @@ -125,15 +128,15 @@ def get_all_parents(taxon_id): def get_species_from_parent(parent_id): """ The function is called inside the `get_relevant_species` - function to get all species childs present under given a parent taxonomy id. + function to get all species children present under given a parent taxonomy id. Parameters: parent_id (str): Parent Taxonomy Id Returns: - return_type (List): List containing all species ids present under + return_type (List): List containing all species ids present under given parent id tree. - + """ ncbi_engine = create_engine("mysql://anonymous@ensembldb.ensembl.org:3306/ncbi_taxonomy_109") @@ -151,4 +154,4 @@ def get_species_from_parent(parent_id): with ncbi_engine.connect() as cursor: rows = cursor.execute(sql_text(query)) - return [r[0] for r in rows] \ No newline at end of file + return [r[0] for r in rows] diff --git a/src/taxon_search/views.py b/src/taxon_search/views.py index 70f5e5d..6847488 100644 --- a/src/taxon_search/views.py +++ b/src/taxon_search/views.py @@ -5,7 +5,7 @@ from sqlalchemy import create_engine, text as sql_text from .search import search_species -from .models import EnsemblMetadata, NCBITaxonFlat +from .models import EnsemblMetadata from .utils import get_relevant_species pymysql.install_as_MySQLdb() @@ -14,8 +14,8 @@ # Create your views here. def index(request): """ - View function for the index/search page of the - taxonomy search django app. + View function for the index/search page of the + taxonomy search django app. Parameters: request : GET request from the front-end containing the query string. @@ -82,8 +82,8 @@ def index(request): def taxon_tree(request, taxon_id): """ - View function (GET) for the taxon tree page of the - taxonomy search django app. + View function (GET) for the taxon tree page of the + taxonomy search django app. Parameters: request : GET request from the front-end containing the query string.