run black and fix a few typos

Asrst · Sep 29, 2023 · 0fd0b54 · 0fd0b54
1 parent c82074c
commit 0fd0b54
Show file tree

Hide file tree

Showing 19 changed files with 129 additions and 87 deletions.
diff --git a/README.md b/README.md
@@ -68,8 +68,7 @@ Before you go ahead make sure you have [Python 3.10](https://www.python.org/down
 
 ### How to install and setup an Elasticsearch server locally
 
-Before you go ahead make sure you have [docker](https://docker.com/) installed and available in your system. The below steps are required only for the initial setup. To re-run an existing docker container, run `docker start my_es_server` and enter the previously saved password if prompted.
-
+Before you go ahead make sure you have [docker](https://docker.com/) installed and available in your system. The steps below are required only for the initial setup. To re-run an existing docker container, run `docker start my_es_server` and enter the previously saved password if prompted.
 
 1. Create the elasticsearch server:
     ```bash
@@ -110,11 +109,11 @@ Before you go ahead make sure you have [docker](https://docker.com/) installed a
     ```bash
     printenv <env-var-name>
     ```
-- To query Database using Django models
+- To query the database using Django models
     ```bash
     python3 manage.py dbshell
     ```
-- Make Django Migrations
+- Make Django migrations
     ```bash
     python3 manage.py makemigrations
     ```

diff --git a/scripts/get_ensembl_metadata.py b/scripts/get_ensembl_metadata.py
@@ -10,16 +10,16 @@ def get_taxon_metadata(db_connection):
     """
     The function connects with ensembl MySQL database
     and runs a sql query to retrieve data needed to load
-    into django models as fixtures. 
+    into django models as fixtures.
 
     Parameters:
     db_connection (sqlalchemy.create_engine): A sqlalchemy engine.
 
     Returns:
-    pandas dataframe (pd.DataFrame): tabluar data.
+    pandas dataframe (pd.DataFrame): tabular data.
 
     """
-    
+
     query = f"""select distinct taxonomy_id ,o.name ,url_name 
                     ,display_name ,scientific_name ,strain
                     from organism o

diff --git a/scripts/get_metazoa_species.py b/scripts/get_metazoa_species.py
@@ -9,7 +9,6 @@
 
 
 def get_taxon_ids(url):
-
     """
     The function scrapes the metazoa taxonomy ids data
     from a fixed url: https://metazoa.ensembl.org/species.html
@@ -19,7 +18,7 @@ def get_taxon_ids(url):
     that belong to metazoa.
 
     """
-        
+
     response = requests.get(url)
     soup = BeautifulSoup(response.text, "lxml")
 
@@ -33,17 +32,17 @@ def get_taxon_ids(url):
 
 def get_taxon_tree(taxon_ids, db_engine):
     """
-    The function connects with ensembl My sql database
+    The function connects with ensembl MySQL database
     and runs a sql query to retrieve taxon tree data needed to load
-    into django models as fixtures. 
+    into django models as fixtures.
 
     Parameters:
     taxon_ids (List): list of taxonomy ids for which entire tree structures
     need to be queried
     db_engine (sqlalchemy.create_engine): A sqlalchemy engine.
 
     Returns:
-    pandas dataframe (pd.DataFrame): tabluar data.
+    pandas dataframe (pd.DataFrame): tabular data.
 
     """
 

diff --git a/scripts/get_phrases.py b/scripts/get_phrases.py
@@ -20,7 +20,7 @@ def get_taxon_ids(url):
     that belong to metazoa.
 
     """
-        
+
     response = requests.get(url)
     soup = BeautifulSoup(response.text, "lxml")
 
@@ -34,7 +34,7 @@ def get_taxon_ids(url):
 
 def preprocess_name(text):
     """
-    Does some basic text preprocessing like 
+    Does some basic text preprocessing like
     removing special characters, extra spaces, etc.
     """
     name = text["name"]
@@ -53,17 +53,17 @@ def preprocess_name(text):
 
 def get_taxon_names(taxon_ids, db_conn):
     """
-    The function connects with ensembl My sql database
-    and runs a sql query to retrieve taxon names like synonyms, common names, 
-    scientif names and converts into elastic search synonym file format. 
+    The function connects with ensembl MySQL database
+    and runs a sql query to retrieve taxon names like synonyms, common names,
+    scientific names and converts into elastic search synonym file format.
 
     Parameters:
     taxon_ids (List): list of taxonomy ids for which entire tree structures
     need to be queried
     db_conn (sqlalchemy.create_engine): A sqlalchemy engine.
 
     Returns:
-    pandas dataframe (pd.DataFrame): tabluar data.
+    pandas dataframe (pd.DataFrame): tabular data.
 
     """
 

diff --git a/scripts/get_synonyms.py b/scripts/get_synonyms.py
@@ -20,7 +20,7 @@ def get_taxon_ids(url):
     that belong to metazoa.
 
     """
-        
+
     response = requests.get(url)
     soup = BeautifulSoup(response.text, "lxml")
 
@@ -34,10 +34,10 @@ def get_taxon_ids(url):
 
 def preprocess_name(text):
     """
-    Does some basic text preprocessing like 
+    Does some basic text preprocessing like
     removing special characters, extra spaces, etc.
     """
-      
+
     name = text["name"]
     if text["name_class"] != "scientific name":
         name = re.sub(r"[,.;@#?!&$\(\)]+\ *", " ", name)
@@ -49,20 +49,20 @@ def preprocess_name(text):
 
 def get_taxon_names(taxon_ids, db_conn):
     """
-    The function connects with ensembl My sql database
-    and runs a sql query to retrieve taxon names like synonyms, common names, 
-    scientif names and converts into elastic search synonym file format. 
+    The function connects with ensembl MySQL database
+    and runs a sql query to retrieve taxon names like synonyms, common names,
+    scientific names and converts into elastic search synonym file format.
 
     Parameters:
     taxon_ids (List): list of taxonomy ids for which entire tree structures
     need to be queried
     db_conn (sqlalchemy.create_engine): A sqlalchemy engine.
 
     Returns:
-    pandas dataframe (pd.DataFrame): tabluar data.
+    pandas dataframe (pd.DataFrame): tabular data.
 
     """
-    
+
     query_df = pd.DataFrame()
     for i in range(len(taxon_ids[:])):
         taxon_id = taxon_ids[i]

diff --git a/scripts/get_taxon.py b/scripts/get_taxon.py
@@ -18,7 +18,7 @@ def get_taxon_ids(url):
     that belong to metazoa.
 
     """
-        
+
     response = requests.get(url)
     soup = BeautifulSoup(response.text, "lxml")
 
@@ -32,21 +32,20 @@ def get_taxon_ids(url):
 
 def get_taxon_tree(taxon_ids, db_engine):
     """
-    The function connects with ensembl My sql database
+    The function connects with ensembl MySQL database
     and runs a sql query to retrieve taxon tree data needed to load
-    into django models as fixtures. 
+    into django models as fixtures.
 
     Parameters:
     taxon_ids (List): list of taxonomy ids for which entire tree structures
     need to be queried
     db_engine (sqlalchemy.create_engine): A sqlalchemy engine.
 
     Returns:
-    pandas dataframe (pd.DataFrame): tabluar data.
+    pandas dataframe (pd.DataFrame): tabular data.
 
     """
 
-
     tree_df = pd.DataFrame()
     for i in range(len(taxon_ids[:])):
         taxon_id = taxon_ids[i]
@@ -76,7 +75,14 @@ def get_taxon_tree(taxon_ids, db_engine):
 
     # get data json for taxon_search.NCBITaxaNode model
     pk_col = ["taxon_id"]
-    field_col = ["parent_id", "rank", "genbank_hidden_flag", "left_index", "right_index", "root_id"]
+    field_col = [
+        "parent_id",
+        "rank",
+        "genbank_hidden_flag",
+        "left_index",
+        "right_index",
+        "root_id",
+    ]
     m1_df = metazoa_df[pk_col + field_col].drop_duplicates()
 
     m1_df["model"] = "taxon_search.NCBITaxaNode"

diff --git a/scripts/get_taxon_flat.py b/scripts/get_taxon_flat.py
@@ -33,20 +33,20 @@ def get_taxon_ids(url):
 
 def get_taxon_tree_flat(taxon_ids, db_conn):
     """
-    The function connects with ensembl My sql database
+    The function connects with ensembl MySQL database
     and runs a sql query to retrieve taxon tree data needed to load
-    into django models as fixtures. 
+    into django models as fixtures.
 
     Parameters:
     taxon_ids (List): list of taxonomy ids for which entire tree structures
     need to be queried
     db_conn (sqlalchemy.create_engine): A sqlalchemy engine.
 
     Returns:
-    pandas dataframe (pd.DataFrame): tabluar data.
+    pandas dataframe (pd.DataFrame): tabular data.
 
     """
-        
+
     unique_taxons = list(set(taxon_ids))
 
     tree_df = pd.DataFrame()
@@ -87,7 +87,6 @@ def get_taxon_tree_flat(taxon_ids, db_conn):
     all_ids = metazoa_ids + add_ids
     metazoa_df = get_taxon_tree_flat(all_ids, db_conn)
 
-
     # the below code converts the dataframe into json format
     # required by django to load as fixtures.
     pk_col = []

diff --git a/src/index_documents.py b/src/index_documents.py
@@ -22,7 +22,11 @@ def __init__(self):
 
     def add_arguments(self, parser):
         parser.add_argument(
-            "-b", "--batch-size", dest="batch_size", type=int, help="Number of items to index at once."
+            "-b",
+            "--batch-size",
+            dest="batch_size",
+            type=int,
+            help="Number of items to index at once.",
         )
         parser.add_argument(
             "-r",
@@ -32,12 +36,26 @@ def add_arguments(self, parser):
             help="Remove objects from the index that are no longer present in \
                   the database.",
         )
-        parser.add_argument("-i", "--index", dest="index", type=str, help="Specify which index to update.")
         parser.add_argument(
-            "-c", "--clear_index", action="store_true", default=False, help="Clear and rebuild index."
+            "-i",
+            "--index",
+            dest="index",
+            type=str,
+            help="Specify which index to update.",
         )
         parser.add_argument(
-            "-a", "--age", dest="age", default=0, help="Number of hours back to consider objects new."
+            "-c",
+            "--clear_index",
+            action="store_true",
+            default=False,
+            help="Clear and rebuild index.",
+        )
+        parser.add_argument(
+            "-a",
+            "--age",
+            dest="age",
+            default=0,
+            help="Number of hours back to consider objects new.",
         )
 
     def handle(self, *args, **options):

diff --git a/src/taxon_search/documents.py b/src/taxon_search/documents.py
@@ -27,13 +27,11 @@
     synonyms=load_synonym_file(ph_file_path),
 )
 
-# Elastic search index time analyzer to be used while indexing documents.  
+# Elastic search index time analyzer to be used while indexing documents.
 index_analyzer = analyzer(
     "index_analyzer",
     tokenizer="standard",
-    filter=["lowercase", "stop", 
-            autophrase_syn_filter, 
-            synonym_token_filter],
+    filter=["lowercase", "stop", autophrase_syn_filter, synonym_token_filter],
 )
 
 #### Define Ensembl Taxonomy Flat on elastic search with appropriate settings.
@@ -45,15 +43,16 @@
 @taxon_flat_index.document
 class TaxonFlatDocument(Document):
     """
-    Elastic Search Document Model for Index data from the 
+    Elastic Search Document Model for Index data from the
     NCBITaxonFlat Django Model.
 
-    Auto Indexing signals are disabled. For re-indexing or updating the 
+    Auto Indexing signals are disabled. For re-indexing or updating the
     index, run the command below in the `src` directory.
 
     python3 manage.py search_index --rebuild
 
     """
+
     taxon_id = fields.IntegerField(attr="taxon_id")
     parent_id = fields.IntegerField(attr="parent_id")
     left_index = fields.IntegerField(attr="left_index")

diff --git a/src/taxon_search/migrations/0001_initial.py b/src/taxon_search/migrations/0001_initial.py
@@ -22,7 +22,8 @@ class Migration(migrations.Migration):
                 (
                     "parent_id",
                     models.ForeignKey(
-                        on_delete=django.db.models.deletion.CASCADE, to="taxon_search.ncbitaxanode"
+                        on_delete=django.db.models.deletion.CASCADE,
+                        to="taxon_search.ncbitaxanode",
                     ),
                 ),
             ],
@@ -36,7 +37,10 @@ class Migration(migrations.Migration):
                 (
                     "id",
                     models.BigAutoField(
-                        auto_created=True, primary_key=True, serialize=False, verbose_name="ID"
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
                     ),
                 ),
                 ("name", models.CharField(db_index=True, max_length=500)),

diff --git a/src/taxon_search/migrations/0003_alter_ncbitaxanode_parent_id.py b/src/taxon_search/migrations/0003_alter_ncbitaxanode_parent_id.py
@@ -6,15 +6,20 @@
 
 class Migration(migrations.Migration):
     dependencies = [
-        ("taxon_search", "0002_rename_taxaname_taxon_id_ncbitaxaname_taxon_id_and_more"),
+        (
+            "taxon_search",
+            "0002_rename_taxaname_taxon_id_ncbitaxaname_taxon_id_and_more",
+        ),
     ]
 
     operations = [
         migrations.AlterField(
             model_name="ncbitaxanode",
             name="parent_id",
             field=models.ForeignKey(
-                default=0, on_delete=django.db.models.deletion.CASCADE, to="taxon_search.ncbitaxanode"
+                default=0,
+                on_delete=django.db.models.deletion.CASCADE,
+                to="taxon_search.ncbitaxanode",
             ),
         ),
     ]
diff --git a/src/taxon_search/migrations/0006_ensemblmetadata.py b/src/taxon_search/migrations/0006_ensemblmetadata.py
@@ -15,7 +15,10 @@ class Migration(migrations.Migration):
                 (
                     "id",
                     models.BigAutoField(
-                        auto_created=True, primary_key=True, serialize=False, verbose_name="ID"
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
                     ),
                 ),
                 ("taxonomy_id", models.IntegerField()),

diff --git a/src/taxon_search/migrations/0008_ncbitaxaflat.py b/src/taxon_search/migrations/0008_ncbitaxaflat.py
@@ -15,7 +15,10 @@ class Migration(migrations.Migration):
                 (
                     "id",
                     models.BigAutoField(
-                        auto_created=True, primary_key=True, serialize=False, verbose_name="ID"
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
                     ),
                 ),
                 ("taxon_id", models.IntegerField()),