
Merge pull request #69 from eQTL-Catalogue/dev
API v2
jdhayhurst authored Jun 13, 2023
2 parents bdbef2c + f47b9f4 commit 84751d7
Showing 250 changed files with 2,412 additions and 19,859 deletions.
6 changes: 6 additions & 0 deletions .dockerignore
@@ -17,6 +17,12 @@ env.*
*.tar
*.gz
*.tar.gz
*.tbi
*.png*

# hidden files
.*

# Python specific files
.cache
__pycache__
32 changes: 22 additions & 10 deletions .gitlab-ci.yml
@@ -16,7 +16,7 @@ variables:

# generate the docs with sphinx
sphinxdocs:
image: $CI_REGISTRY_IMAGE:latest
image: $CI_REGISTRY_IMAGE:dev
stage: docs
script:
- mkdir logs
@@ -28,6 +28,7 @@ sphinxdocs:
- docs/build/html
only:
- master
- dev

# always build an image tagged with the commit SHA from master
build:
@@ -41,6 +42,19 @@ build:
only:
- master

build_dev:
stage: build
script:
- echo "$DOCKER_HUB_PASSWORD" > dhpw.txt
- docker login -u "${DOCKER_HUB_USER}" --password-stdin < dhpw.txt docker.io
- docker pull $CI_REGISTRY_IMAGE:latest
- docker build --cache-from $CI_REGISTRY_IMAGE:latest -t $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA .
- docker tag $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA $CI_REGISTRY_IMAGE:dev
- docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
- docker push $CI_REGISTRY_IMAGE:dev
only:
- dev


build_docs:
stage: build
@@ -54,7 +68,7 @@ build_docs:
- sphinxdocs
only:
- master

- dev

# if a tag is created, create an image with the tag and a latest
build_release:
@@ -97,15 +111,15 @@ deploy_staging:
- echo "Deploy to staging server"
- mkdir -p /root/.kube
- echo ${FALLBACK_KUBECONF} | base64 -d > /root/.kube/config
- helm init
- helm delete --purge eqtl-sumstats || true
- sleep 30
- helm install --name eqtl-sumstats --set k8Namespace=eqtl,image.repository=$CI_REGISTRY_IMAGE,image.tag=$CI_COMMIT_SHA,image.uid=$UID,image.gid=$GID,staticService.image.repository=$CI_REGISTRY_DOCS_IMAGE,staticService.image.tag=$CI_COMMIT_SHA,volume.data.nfsServer=$NFS_SERVER,volume.data.path=$NFS_PATH,replicaCount=1 ./eqtlss/ --wait
- helm init --stable-repo-url https://charts.helm.sh/stable
- helm delete --purge eqtl-sumstats-dev || true
- helm install --name eqtl-sumstats-dev --set k8Namespace=dev-gwas,service.name=eqtl-rest-api-dev,staticService.name=eqtl-api-docs-dev,staticService.path=/api-docs,image.repository=$CI_REGISTRY_IMAGE,image.tag=$CI_COMMIT_SHA,image.uid=$UID,image.gid=$GID,staticService.image.repository=$CI_REGISTRY_DOCS_IMAGE,staticService.image.tag=$CI_COMMIT_SHA,volume.data.nfsServer=$NFS_SERVER,volume.data.path=$NFS_PATH_DEV,replicaCount=1 ./eqtlss/ --wait
environment:
name: dev
when: manual
only:
- master
- dev


deploy_fallback:
@@ -115,9 +129,8 @@ deploy_fallback:
- echo "Deploy to fallback server"
- mkdir -p /root/.kube
- echo ${FALLBACK_KUBECONF} | base64 -d > /root/.kube/config
- helm init
- helm init --stable-repo-url https://charts.helm.sh/stable
- helm delete --purge eqtl-sumstats || true
- sleep 30
- helm install --name eqtl-sumstats --set k8Namespace=eqtl,image.repository=$CI_REGISTRY_IMAGE,image.tag=$CI_COMMIT_SHA,image.uid=$UID,image.gid=$GID,staticService.image.repository=$CI_REGISTRY_DOCS_IMAGE,staticService.image.tag=$CI_COMMIT_SHA,volume.data.nfsServer=$NFS_SERVER,volume.data.path=$NFS_PATH ./eqtlss/ --wait
environment:
name: fallback
@@ -133,9 +146,8 @@ deploy_prod:
- echo "Deploy to production server"
- mkdir -p /root/.kube
- echo ${PROD_KUBECONF} | base64 -d > /root/.kube/config
- helm init
- helm init --stable-repo-url https://charts.helm.sh/stable
- helm delete --purge eqtl-sumstats || true
- sleep 30
- helm install --name eqtl-sumstats --set k8Namespace=eqtl,image.repository=$CI_REGISTRY_IMAGE,image.tag=$CI_COMMIT_SHA,image.uid=$UID,image.gid=$GID,staticService.image.repository=$CI_REGISTRY_DOCS_IMAGE,staticService.image.tag=$CI_COMMIT_SHA,volume.data.nfsServer=$NFS_SERVER_PROD,volume.data.path=$NFS_PATH_PROD ./eqtlss/ --wait
environment:
name: production
6 changes: 3 additions & 3 deletions Dockerfile
@@ -1,8 +1,9 @@
FROM hdfgroup/h5py:2.7.0
FROM hdfgroup/hdf5lib:1.10.6


COPY requirements.txt /application/
WORKDIR /application
RUN pip install --upgrade pip
RUN pip install -r requirements.txt --ignore-installed six

COPY setup.py .
@@ -19,11 +20,10 @@ ENV EQSS_CONFIG "/application/config/properties.json"
ENV GACC_LOGS "logs/gaccess.log"
ENV GERR_LOGS "logs/gerror.log"
ENV GUNI_LOGS "logs/glogger.log"

ENV USER docker

ENV UID 1000
ENV GID 1000
ENV HDF5_ROOT_DIR "/files/output/"

RUN addgroup --gid "$GID" "$USER" \
&& adduser \
170 changes: 42 additions & 128 deletions README.md
@@ -4,157 +4,71 @@

EQTL Summary statistics with HDF5

The concept is to leverage the fast query times and multidimensional indexing capabilities of HDF5 to enable fast, useful querying of very large summary statistics datasets. There are loading scripts to convert TSV summary statistics to HDF5 using [PyTables](https://www.pytables.org/). Command line utilities enable the querying of HDF5, and a Python Flask app serves the data via a REST API. This is containerised and deployed to the cloud using Kubernetes.
The concept is to leverage the fast query times and multidimensional indexing capabilities of HDF5 to enable fast, useful querying of very large summary statistics datasets. There are loading scripts to convert TSV summary statistics to HDF5 using [PyTables](https://www.pytables.org/). A Python FastAPI app serves the data via a REST API. This is containerised and deployed to the cloud using Kubernetes.

This REST API facilitates filtered requests for eQTL association data.

# Local installation - using conda and pip
## API v2

- Clone the repository
- `git clone https://github.com/EBISPOT/SumStats.git`
- `cd SumStats`
- Create conda environment (installs HDF5 lib and other dependencies)
- `conda env create -f sumstats.yml`
- `conda activate sumstats`
- pip install the sumstats package - this will install pytables, numpy, flask, gunicorn - and sumstats
- `pip install -r requirements.txt`
- `pip install .`
Each study in the catalogue is split by QTL context and these splits are
assigned their own dataset IDs (QTD#). Datasets can be browsed at the `/datasets`
endpoint.

To retrieve the summary statistics for a dataset, use the
`/datasets/<DATASETID>/associations` endpoint and apply
any required filters.
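
For example, assuming a local instance on port 8000 with base path `/eqtl/api` (matching the Swagger URL below) and a hypothetical dataset ID `QTD000021`, queries might look like this sketch:

```
# List the available datasets (QTD# identifiers).
curl "http://127.0.0.1:8000/eqtl/api/datasets"

# Fetch associations for one dataset. The dataset ID and the size
# parameter are illustrative assumptions -- check the Swagger docs
# for the real filter names.
curl "http://127.0.0.1:8000/eqtl/api/datasets/QTD000021/associations?size=10"
```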

# Setting properties
Under the `config` directory you will find the files that are responsible for setting the runtime properties.
## API v1

`properties.py` is the default one. It can be altered, but you will need to re-install the package in order for the changes to take effect.
This will be deprecated and is maintained only for existing integrations.

`properties.json` can be edited and passed via an environment variable, `export EQSS_CONFIG=<path to json config>`, when running any of the following (see the example after this list):
- `gunicorn -b <host:port> --chdir sumstats/server --access-logfile <path to access log file> --error-logfile <path to error log file> app:app [--log-level <log level>]` to run the API
- `eqtl-search` to search the database via command line
- `eqtl-explore` to explore what is saved in the database via command line
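
A minimal sketch of using a custom config with the toolkit (the config path is illustrative):

```
# Point the toolkit at a custom config, then run one of the commands above.
export EQSS_CONFIG=/path/to/properties.json
eqtl-explore -studies   # list all the studies in the database
```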

# Installation

# Loading
Requires the HDF5 library. The easiest way to manage this is to use the Docker image. There is a public image, `ebispot/eqtl-sumstats-api`; for development, you can clone this repo and run `docker build -t ebispot/eqtl-sumstats-api:local .`

Data loading means converting TSV data to HDF5. It involves merging in data from five sources: i) summary statistics, ii) variant data, iii) expression data, iv) phenotype metadata, v) rsids from dbSNP. These data are split and joined to produce CSV and HDF5 files, where each file represents a study × QTL group × tissue × chromosome.
Once the package is installed, you can load and search for studies using the command-line toolkit.
# Running
## The web app
`docker run -v <path to API_v2>:/files/output -p 8000:8000 -e HDF5_ROOT_DIR=/files/output ebispot/eqtl-sumstats-api:local uvicorn sumstats.main:app --host 0.0.0.0`
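
Once the container is running, a quick sanity check from the host might look like this, assuming the Swagger docs route shown below:

```
# The Swagger UI should respond once the app is up.
curl -I "http://127.0.0.1:8000/eqtl/api/docs"
```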

```
$ eqtl-load --help
usage: eqtl-load [-h] [-f F] [-csv CSV] [-var VAR] [-phen PHEN] [-expr EXPR]
[-study STUDY] [-qtl_group QTL_GROUP] [-quant QUANT]
[-tissue TISSUE] [-chr CHR] -loader {study,trait,study_info}
[-tissue_ont TISSUE_ONT] [-treatment TREATMENT]
[-treatment_ont TREATMENT_ONT]
optional arguments:
-h, --help show this help message and exit
-f F The path to the summary statistics file to be
processed
-csv CSV The path to the csv OUT file
-var VAR The path to the variant/genotype metadata file
-phen PHEN The path to the trait/phenotype metadata file
-expr EXPR The path to the gene expression file
-study STUDY The study identifier
-qtl_group QTL_GROUP The qtl group e.g. "LCL"
-quant QUANT The quantification method e.g. "gene counts"
-tissue TISSUE The tissue
-chr CHR The chromosome the data belongs to
-loader {study,trait,study_info}
The loader
-tissue_ont TISSUE_ONT
The tissue ontology term
-treatment TREATMENT The treatment
-treatment_ont TREATMENT_ONT
The treatment ontology term
```

After conversion to HDF5, the file should be indexed (the fields to index can be modified in the [common_constants.py](sumstats/common_constants.py) file):
The Swagger docs are available at: http://127.0.0.1:8000/eqtl/api/docs

```
$ eqtl-reindex --help
usage: eqtl-reindex [-h] -f F
optional arguments:
-h, --help show this help message and exit
-f F The path to the HDF5 file to be processed
```

It is recommended to 'repack' the HDF5 to save disk space, using the PyTables [`ptrepack`](https://www.pytables.org/usersguide/utilities.html#ptrepack) command-line utility.
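A typical invocation might look like the sketch below; the compression settings are illustrative choices, not requirements of this repo:

```
# Repack with compression and rebuilt indexes; writes a new file,
# leaving the input untouched.
ptrepack --complevel 9 --complib blosc --propindexes \
    dataset1.h5:/ dataset1.repacked.h5:/
```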
## Data loading

Data loading means converting tsv data to HDF5.

# Exploring
To explore the contents of the database you can use the following commands:
### Convert sumstats TSVs to HDF5

Requires a Nextflow and Docker installation (or Singularity: run with `-with-singularity` instead of `-with-docker`).
The input TSVs must all be in a single directory (`--tsv_dir`).
The HDF5 output will be written to `--hdf5_dir`.
```
$ eqtl-explore --help
usage: eqtl-explore [-h] [-molecular_phenotypes]
[-molecular_phenotype MOLECULAR_PHENOTYPE] [-studies]
[-study STUDY] [-tissues] [-tissue TISSUE] [-chromosomes]
[-genes]
optional arguments:
-h, --help show this help message and exit
-molecular_phenotypes
List all the molecular_phenotypes
-molecular_phenotype MOLECULAR_PHENOTYPE
List all the studies for a molecular_phenotype
-studies List all the studies
-study STUDY Will list 'trait: study' if it exists
-tissues List all the tissues
-tissue TISSUE Will list 'study: tissue' if it exists
-chromosomes Will list all the chromosomes
-genes List all the genes
nextflow run sumstats/api_v2/workflows/tsv2hdf.nf --tsv_dir ./tsv/ --hdf5_dir ./hdf5/ -with-docker docker://ebispot/eqtl-sumstats-api:local
```
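
If Docker is not available, the same run with Singularity (per the note above) would be:

```
nextflow run sumstats/api_v2/workflows/tsv2hdf.nf --tsv_dir ./tsv/ --hdf5_dir ./hdf5/ -with-singularity docker://ebispot/eqtl-sumstats-api:local
```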


# Searching
To retrieve data from the database you can use the following commands:
The Nextflow pipeline runs a CLI:
```
$ eqtl-search --help
usage: eqtl-search [-h] [-path PATH] [-all] [-start START] [-size SIZE]
[-trait TRAIT] [-gene GENE] [-study STUDY] [-tissue TISSUE]
[-qtl_group QTL_GROUP] [-snp SNP] [-chr CHR] [-pval PVAL]
[-bp BP] [-quant_method {ge,tx,txrev,microarray,exon}]
[-paginate [PAGINATE]]
tsv2hdf --help
usage: tsv2hdf [-h] -t T -hdf HDF -type {data,metadata}
optional arguments:
-h, --help show this help message and exit
-path PATH Full path to the dir where the h5files will be stored
-all Use argument if you want to search for all
associations
-start START Index of the first association retrieved
-size SIZE Number of retrieved associations
-trait TRAIT The trait I am looking for
-gene GENE The gene I am looking for
-study STUDY The study I am looking for
-tissue TISSUE The tissue I am looking for
-qtl_group QTL_GROUP The QTL group/context I am looking for
-snp SNP The SNP I am looking for
-chr CHR The chromosome I am looking for
-pval PVAL Filter by pval threshold: -pval floor:ceil
-bp BP Filter with base pair location threshold: -bp
floor:ceil
-quant_method {ge,tx,txrev,microarray,exon}
The quantification method
-paginate [PAGINATE] Sets paginate to "False" if you would like to fetch
all associations for your query
-t T tsv path
-hdf HDF hdf5 file label e.g. dataset1
-type {data,metadata}
specify whether it is data or metadata
```
The Nextflow pipeline runs the above CLI for _each_ TSV (`$tsv_file`), where `$id` is the dataset ID:
```
tsv2hdf -t $tsv_file -hdf $id -type data;
```
The pipeline does not process the metadata. To load it, run the following, where `$tsv_file` is the metadata TSV, e.g. [this one](https://github.com/eQTL-Catalogue/eQTL-Catalogue-resources/blob/master/data_tables/dataset_metadata.tsv):
```
tsv2hdf -t $tsv_file -hdf qtl_metadata -type metadata
```
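
Since the CLI ships in the Docker image, one way to run the metadata load outside Nextflow is via the container; the mount path and file name below are illustrative:

```
# Hedged sketch: mount the working dir and run the metadata loader in the image.
docker run -v $PWD:/files ebispot/eqtl-sumstats-api:local \
    tsv2hdf -t /files/dataset_metadata.tsv -hdf qtl_metadata -type metadata
```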
The data will by default be retrieved in batches of 20 SNPs and their info per query. You can page through the data by keeping the default size of 20 and updating the start flag as you go (`-start <start>`), or use the flags `-start <start> -size <size>` to specify the size of each retrieval. The default value of `quant_method` is 'ge'.

There are two more flags that you can use (combined in the sketch below):
1. `-bp floor:ceil` e.g. `-bp 10000:20000000`, which specifies the range of base pair locations on the chromosome that you want; most useful when querying by chromosome, or by trait/study
2. `-pval floor:ceil` e.g. `-pval 2e-10:5e-5` or `-pval 0.000003:4e-3`, which specifies the p-value range of the results.
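
For example, a single query combining pagination with both range filters might look like this sketch (the study name is hypothetical):
```
eqtl-search -study STUDY1 -chr 1 -bp 10000:20000000 -pval 2e-10:5e-5 -start 0 -size 20
```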

Note that the `output` directory is set by default to `./files/output` in the properties file. If you need to change where it resides, modify the properties.json file and set `export EQSS_CONFIG=<path to json config>` accordingly.

The API uses the same search module and is more thoroughly documented [here](https://www.ebi.ac.uk/eqtl/api-docs/), which explains the fields and constraints.

# Exposing the API
To expose the API you need to run: `gunicorn -b <host:port> --chdir sumstats/server --access-logfile <path to access log file> --error-logfile <path to error log file> app:app [--log-level <log level>]`

You can set the environment variable `export EQSS_CONFIG=<path to json config>` to change the default properties, such as the directory where all the data is stored (the output directory), as explained in the sections above.

This will spin up the service and make it available on port 8080 (if running via docker, we exposed the port when we spun up the container).

You should be able to see the API on http://localhost:8080/
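
Filling in the placeholders, a concrete invocation might look like the following; the log paths are illustrative:

```
gunicorn -b 0.0.0.0:8080 --chdir sumstats/server \
    --access-logfile logs/gaccess.log --error-logfile logs/gerror.log app:app
```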
## Project structure
There are two versions of the API, v1 and v2. The code is separated under [sumstats](sumstats) so that when the v1 API is ready to be removed, [sumstats/api_v1](sumstats/api_v1/) can simply be deleted. [sumstats/main.py](sumstats/main.py), where the FastAPI app is created, will also need to be updated if v1 is removed.

# Deployment
Kubernetes deployment has been configured with GitLab using [.gitlab-ci.yml](.gitlab-ci.yml) and [Helm](eqtlss).
4 changes: 2 additions & 2 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
@@ -69,7 +69,7 @@ API quick reference

With the exception of genomic region requests, all requests for associations can all be made using the ``/associations`` endpoint, adding and combining parameters as needed for filtering.

.. qrefflask:: sumstats.server.app:app
.. qrefflask:: sumstats.api_v1.server.app:app
:undoc-static:
:order: path

@@ -363,6 +363,6 @@ A ``GET`` request is used to access the API.
API detailed reference
======================

.. autoflask:: sumstats.server.app:app
.. autoflask:: sumstats.api_v1.server.app:app
:undoc-static:
:order: path
4 changes: 3 additions & 1 deletion eqtlss/templates/deployment.yaml
Original file line number Diff line number Diff line change
@@ -17,6 +17,8 @@ spec:
value: {{.Values.image.gid}}
- name: USER
value: {{.Values.image.user}}
- name: HDF5_ROOT_DIR
value: {{.Values.image.dataMountPath}}
template:
metadata:
labels:
@@ -25,7 +27,7 @@ spec:
containers:
- name: {{.Values.service.name}}
image: "{{.Values.image.repository}}:{{.Values.image.tag}}"
command: {{.Values.service.gunicornCmd}}
command: {{.Values.service.uvicornCmd}}
imagePullPolicy: {{.Values.image.pullPolicy}}
resources:
requests:
1 change: 1 addition & 0 deletions eqtlss/values.yaml
Original file line number Diff line number Diff line change
@@ -46,6 +46,7 @@ service:
port: 8000
path: /eqtl/api(/|$)(.*)
gunicornCmd: '["gunicorn", "-b", "0.0.0.0:8000", "sumstats.server.app:app","--log-level=debug","--access-logfile=logs/ss_access.log","--error-logfile=logs/ss_error.log", "--workers=4"]'
uvicornCmd: '["uvicorn", "sumstats.main:app", "--host", "0.0.0.0", "--port", "8000", "--log-level=debug"]'

staticService:
replicaCount: 1
Binary file removed nginx/build/doctrees/environment.pickle
Binary file removed nginx/build/doctrees/index.doctree
4 changes: 0 additions & 4 deletions nginx/build/html/.buildinfo

This file was deleted.

Empty file removed nginx/build/html/.nojekyll
