Skip to content

Commit

Permalink
run black and fix a few typos
Browse files Browse the repository at this point in the history
  • Loading branch information
JAlvarezJarreta committed Sep 29, 2023
1 parent c82074c commit 0fd0b54
Show file tree
Hide file tree
Showing 19 changed files with 129 additions and 87 deletions.
7 changes: 3 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,7 @@ Before you go ahead make sure you have [Python 3.10](https://www.python.org/down

### How to install and setup an Elasticsearch server locally

Before you go ahead make sure you have [docker](https://docker.com/) installed and available in your system. The below steps are required only for the initial setup. To re-run an existing docker container, run `docker start my_es_server` and enter the previously saved password if prompted.

Before you go ahead make sure you have [docker](https://docker.com/) installed and available in your system. The steps below are required only for the initial setup. To re-run an existing docker container, run `docker start my_es_server` and enter the previously saved password if prompted.

1. Create the elasticsearch server:
```bash
Expand Down Expand Up @@ -110,11 +109,11 @@ Before you go ahead make sure you have [docker](https://docker.com/) installed a
```bash
printenv <env-var-name>
```
- To query Database using Django models
- To query the database using Django models
```bash
python3 manage.py dbshell
```
- Make Django Migrations
- Make Django migrations
```bash
python3 manage.py makemigrations
```
Expand Down
6 changes: 3 additions & 3 deletions scripts/get_ensembl_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,16 @@ def get_taxon_metadata(db_connection):
"""
The function connects with ensembl MySQL database
and runs a sql query to retrieve data needed to load
into django models as fixtures.
into django models as fixtures.
Parameters:
db_connection (sqlalchemy.create_engine): A sqlalchemy engine.
Returns:
pandas dataframe (pd.DataFrame): tabluar data.
pandas dataframe (pd.DataFrame): tabular data.
"""

query = f"""select distinct taxonomy_id ,o.name ,url_name
,display_name ,scientific_name ,strain
from organism o
Expand Down
9 changes: 4 additions & 5 deletions scripts/get_metazoa_species.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@


def get_taxon_ids(url):

"""
The function scrapes the metazoa taxonomy ids data
from a fixed url: https://metazoa.ensembl.org/species.html
Expand All @@ -19,7 +18,7 @@ def get_taxon_ids(url):
that belong to metazoa.
"""

response = requests.get(url)
soup = BeautifulSoup(response.text, "lxml")

Expand All @@ -33,17 +32,17 @@ def get_taxon_ids(url):

def get_taxon_tree(taxon_ids, db_engine):
"""
The function connects with ensembl My sql database
The function connects with ensembl MySQL database
and runs a sql query to retrieve taxon tree data needed to load
into django models as fixtures.
into django models as fixtures.
Parameters:
taxon_ids (List): list of taxonomy ids for which entire tree structures
need to be queried
db_engine (sqlalchemy.create_engine): A sqlalchemy engine.
Returns:
pandas dataframe (pd.DataFrame): tabluar data.
pandas dataframe (pd.DataFrame): tabular data.
"""

Expand Down
12 changes: 6 additions & 6 deletions scripts/get_phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def get_taxon_ids(url):
that belong to metazoa.
"""

response = requests.get(url)
soup = BeautifulSoup(response.text, "lxml")

Expand All @@ -34,7 +34,7 @@ def get_taxon_ids(url):

def preprocess_name(text):
"""
Does some basic text preprocessing like
Does some basic text preprocessing like
removing special characters, extra spaces, etc.
"""
name = text["name"]
Expand All @@ -53,17 +53,17 @@ def preprocess_name(text):

def get_taxon_names(taxon_ids, db_conn):
"""
The function connects with ensembl My sql database
and runs a sql query to retrieve taxon names like synonyms, common names,
scientif names and converts into elastic search synonym file format.
The function connects with ensembl MySQL database
and runs a sql query to retrieve taxon names like synonyms, common names,
scientific names and converts into elastic search synonym file format.
Parameters:
taxon_ids (List): list of taxonomy ids for which entire tree structures
need to be queried
db_conn (sqlalchemy.create_engine): A sqlalchemy engine.
Returns:
pandas dataframe (pd.DataFrame): tabluar data.
pandas dataframe (pd.DataFrame): tabular data.
"""

Expand Down
16 changes: 8 additions & 8 deletions scripts/get_synonyms.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def get_taxon_ids(url):
that belong to metazoa.
"""

response = requests.get(url)
soup = BeautifulSoup(response.text, "lxml")

Expand All @@ -34,10 +34,10 @@ def get_taxon_ids(url):

def preprocess_name(text):
"""
Does some basic text preprocessing like
Does some basic text preprocessing like
removing special characters, extra spaces, etc.
"""

name = text["name"]
if text["name_class"] != "scientific name":
name = re.sub(r"[,.;@#?!&$\(\)]+\ *", " ", name)
Expand All @@ -49,20 +49,20 @@ def preprocess_name(text):

def get_taxon_names(taxon_ids, db_conn):
"""
The function connects with ensembl My sql database
and runs a sql query to retrieve taxon names like synonyms, common names,
scientif names and converts into elastic search synonym file format.
The function connects with ensembl MySQL database
and runs a sql query to retrieve taxon names like synonyms, common names,
scientific names and converts into elastic search synonym file format.
Parameters:
taxon_ids (List): list of taxonomy ids for which entire tree structures
need to be queried
db_conn (sqlalchemy.create_engine): A sqlalchemy engine.
Returns:
pandas dataframe (pd.DataFrame): tabluar data.
pandas dataframe (pd.DataFrame): tabular data.
"""

query_df = pd.DataFrame()
for i in range(len(taxon_ids[:])):
taxon_id = taxon_ids[i]
Expand Down
18 changes: 12 additions & 6 deletions scripts/get_taxon.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def get_taxon_ids(url):
that belong to metazoa.
"""

response = requests.get(url)
soup = BeautifulSoup(response.text, "lxml")

Expand All @@ -32,21 +32,20 @@ def get_taxon_ids(url):

def get_taxon_tree(taxon_ids, db_engine):
"""
The function connects with ensembl My sql database
The function connects with ensembl MySQL database
and runs a sql query to retrieve taxon tree data needed to load
into django models as fixtures.
into django models as fixtures.
Parameters:
taxon_ids (List): list of taxonomy ids for which entire tree structures
need to be queried
db_engine (sqlalchemy.create_engine): A sqlalchemy engine.
Returns:
pandas dataframe (pd.DataFrame): tabluar data.
pandas dataframe (pd.DataFrame): tabular data.
"""


tree_df = pd.DataFrame()
for i in range(len(taxon_ids[:])):
taxon_id = taxon_ids[i]
Expand Down Expand Up @@ -76,7 +75,14 @@ def get_taxon_tree(taxon_ids, db_engine):

# get data json for taxon_search.NCBITaxaNode model
pk_col = ["taxon_id"]
field_col = ["parent_id", "rank", "genbank_hidden_flag", "left_index", "right_index", "root_id"]
field_col = [
"parent_id",
"rank",
"genbank_hidden_flag",
"left_index",
"right_index",
"root_id",
]
m1_df = metazoa_df[pk_col + field_col].drop_duplicates()

m1_df["model"] = "taxon_search.NCBITaxaNode"
Expand Down
9 changes: 4 additions & 5 deletions scripts/get_taxon_flat.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,20 @@ def get_taxon_ids(url):

def get_taxon_tree_flat(taxon_ids, db_conn):
"""
The function connects with ensembl My sql database
The function connects with ensembl MySQL database
and runs a sql query to retrieve taxon tree data needed to load
into django models as fixtures.
into django models as fixtures.
Parameters:
taxon_ids (List): list of taxonomy ids for which entire tree structures
need to be queried
db_conn (sqlalchemy.create_engine): A sqlalchemy engine.
Returns:
pandas dataframe (pd.DataFrame): tabluar data.
pandas dataframe (pd.DataFrame): tabular data.
"""

unique_taxons = list(set(taxon_ids))

tree_df = pd.DataFrame()
Expand Down Expand Up @@ -87,7 +87,6 @@ def get_taxon_tree_flat(taxon_ids, db_conn):
all_ids = metazoa_ids + add_ids
metazoa_df = get_taxon_tree_flat(all_ids, db_conn)


# the below code converts the dataframe into json format
# required by django to load as fixtures.
pk_col = []
Expand Down
26 changes: 22 additions & 4 deletions src/index_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@ def __init__(self):

def add_arguments(self, parser):
parser.add_argument(
"-b", "--batch-size", dest="batch_size", type=int, help="Number of items to index at once."
"-b",
"--batch-size",
dest="batch_size",
type=int,
help="Number of items to index at once.",
)
parser.add_argument(
"-r",
Expand All @@ -32,12 +36,26 @@ def add_arguments(self, parser):
help="Remove objects from the index that are no longer present in \
the database.",
)
parser.add_argument("-i", "--index", dest="index", type=str, help="Specify which index to update.")
parser.add_argument(
"-c", "--clear_index", action="store_true", default=False, help="Clear and rebuild index."
"-i",
"--index",
dest="index",
type=str,
help="Specify which index to update.",
)
parser.add_argument(
"-a", "--age", dest="age", default=0, help="Number of hours back to consider objects new."
"-c",
"--clear_index",
action="store_true",
default=False,
help="Clear and rebuild index.",
)
parser.add_argument(
"-a",
"--age",
dest="age",
default=0,
help="Number of hours back to consider objects new.",
)

def handle(self, *args, **options):
Expand Down
11 changes: 5 additions & 6 deletions src/taxon_search/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,11 @@
synonyms=load_synonym_file(ph_file_path),
)

# Elastic search index time analyzer to be used while indexing documents.
# Elastic search index time analyzer to be used while indexing documents.
index_analyzer = analyzer(
"index_analyzer",
tokenizer="standard",
filter=["lowercase", "stop",
autophrase_syn_filter,
synonym_token_filter],
filter=["lowercase", "stop", autophrase_syn_filter, synonym_token_filter],
)

#### Define Ensembl Taxonomy Flat on elastic search with appropriate settings.
Expand All @@ -45,15 +43,16 @@
@taxon_flat_index.document
class TaxonFlatDocument(Document):
"""
Elastic Search Document Model for Index data from the
Elastic Search Document Model for Index data from the
NCBITaxonFlat Django Model.
Auto Indexing signals are disabled. For re-indexing or updating the
Auto Indexing signals are disabled. For re-indexing or updating the
index, run the below command in `src` directory.
python3 manage.py search_index --rebuild
"""

taxon_id = fields.IntegerField(attr="taxon_id")
parent_id = fields.IntegerField(attr="parent_id")
left_index = fields.IntegerField(attr="left_index")
Expand Down
8 changes: 6 additions & 2 deletions src/taxon_search/migrations/0001_initial.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ class Migration(migrations.Migration):
(
"parent_id",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE, to="taxon_search.ncbitaxanode"
on_delete=django.db.models.deletion.CASCADE,
to="taxon_search.ncbitaxanode",
),
),
],
Expand All @@ -36,7 +37,10 @@ class Migration(migrations.Migration):
(
"id",
models.BigAutoField(
auto_created=True, primary_key=True, serialize=False, verbose_name="ID"
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("name", models.CharField(db_index=True, max_length=500)),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,20 @@

class Migration(migrations.Migration):
dependencies = [
("taxon_search", "0002_rename_taxaname_taxon_id_ncbitaxaname_taxon_id_and_more"),
(
"taxon_search",
"0002_rename_taxaname_taxon_id_ncbitaxaname_taxon_id_and_more",
),
]

operations = [
migrations.AlterField(
model_name="ncbitaxanode",
name="parent_id",
field=models.ForeignKey(
default=0, on_delete=django.db.models.deletion.CASCADE, to="taxon_search.ncbitaxanode"
default=0,
on_delete=django.db.models.deletion.CASCADE,
to="taxon_search.ncbitaxanode",
),
),
]
5 changes: 4 additions & 1 deletion src/taxon_search/migrations/0006_ensemblmetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ class Migration(migrations.Migration):
(
"id",
models.BigAutoField(
auto_created=True, primary_key=True, serialize=False, verbose_name="ID"
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("taxonomy_id", models.IntegerField()),
Expand Down
5 changes: 4 additions & 1 deletion src/taxon_search/migrations/0008_ncbitaxaflat.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ class Migration(migrations.Migration):
(
"id",
models.BigAutoField(
auto_created=True, primary_key=True, serialize=False, verbose_name="ID"
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("taxon_id", models.IntegerField()),
Expand Down
Loading

0 comments on commit 0fd0b54

Please sign in to comment.