
Merge pull request #69 from eQTL-Catalogue/dev
API v2
jdhayhurst authored Jun 13, 2023
2 parents bdbef2c + f47b9f4 commit 84751d7
Showing 250 changed files with 2,412 additions and 19,859 deletions.
6 changes: 6 additions & 0 deletions .dockerignore
@@ -17,6 +17,12 @@ env.*
*.tar
*.gz
*.tar.gz
*.tbi
*.png*

# hidden files
.*

# Python specific files
.cache
__pycache__
32 changes: 22 additions & 10 deletions .gitlab-ci.yml
@@ -16,7 +16,7 @@ variables:

# generate the docs with sphinx
sphinxdocs:
image: $CI_REGISTRY_IMAGE:latest
image: $CI_REGISTRY_IMAGE:dev
stage: docs
script:
- mkdir logs
@@ -28,6 +28,7 @@ sphinxdocs:
- docs/build/html
only:
- master
- dev

# always build an image tagged with the commit SHA from master
build:
@@ -41,6 +42,19 @@ build:
only:
- master

build_dev:
stage: build
script:
- echo "$DOCKER_HUB_PASSWORD" > dhpw.txt
- docker login -u "${DOCKER_HUB_USER}" --password-stdin < dhpw.txt docker.io
- docker pull $CI_REGISTRY_IMAGE:latest
- docker build --cache-from $CI_REGISTRY_IMAGE:latest -t $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA .
- docker tag $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA $CI_REGISTRY_IMAGE:dev
- docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
- docker push $CI_REGISTRY_IMAGE:dev
only:
- dev


build_docs:
stage: build
@@ -54,7 +68,7 @@ build_docs:
- sphinxdocs
only:
- master

- dev

# if a tag is created, create an image with the tag and a latest
build_release:
@@ -97,15 +111,15 @@ deploy_staging:
- echo "Deploy to staging server"
- mkdir -p /root/.kube
- echo ${FALLBACK_KUBECONF} | base64 -d > /root/.kube/config
- helm init
- helm delete --purge eqtl-sumstats || true
- sleep 30
- helm install --name eqtl-sumstats --set k8Namespace=eqtl,image.repository=$CI_REGISTRY_IMAGE,image.tag=$CI_COMMIT_SHA,image.uid=$UID,image.gid=$GID,staticService.image.repository=$CI_REGISTRY_DOCS_IMAGE,staticService.image.tag=$CI_COMMIT_SHA,volume.data.nfsServer=$NFS_SERVER,volume.data.path=$NFS_PATH,replicaCount=1 ./eqtlss/ --wait
- helm init --stable-repo-url https://charts.helm.sh/stable
- helm delete --purge eqtl-sumstats-dev || true
- helm install --name eqtl-sumstats-dev --set k8Namespace=dev-gwas,service.name=eqtl-rest-api-dev,staticService.name=eqtl-api-docs-dev,staticService.path=/api-docs,image.repository=$CI_REGISTRY_IMAGE,image.tag=$CI_COMMIT_SHA,image.uid=$UID,image.gid=$GID,staticService.image.repository=$CI_REGISTRY_DOCS_IMAGE,staticService.image.tag=$CI_COMMIT_SHA,volume.data.nfsServer=$NFS_SERVER,volume.data.path=$NFS_PATH_DEV,replicaCount=1 ./eqtlss/ --wait
environment:
name: dev
when: manual
only:
- master
- dev


deploy_fallback:
@@ -115,9 +129,8 @@ deploy_fallback:
- echo "Deploy to fallback server"
- mkdir -p /root/.kube
- echo ${FALLBACK_KUBECONF} | base64 -d > /root/.kube/config
- helm init
- helm init --stable-repo-url https://charts.helm.sh/stable
- helm delete --purge eqtl-sumstats || true
- sleep 30
- helm install --name eqtl-sumstats --set k8Namespace=eqtl,image.repository=$CI_REGISTRY_IMAGE,image.tag=$CI_COMMIT_SHA,image.uid=$UID,image.gid=$GID,staticService.image.repository=$CI_REGISTRY_DOCS_IMAGE,staticService.image.tag=$CI_COMMIT_SHA,volume.data.nfsServer=$NFS_SERVER,volume.data.path=$NFS_PATH ./eqtlss/ --wait
environment:
name: fallback
@@ -133,9 +146,8 @@ deploy_prod:
- echo "Deploy to production server"
- mkdir -p /root/.kube
- echo ${PROD_KUBECONF} | base64 -d > /root/.kube/config
- helm init
- helm init --stable-repo-url https://charts.helm.sh/stable
- helm delete --purge eqtl-sumstats || true
- sleep 30
- helm install --name eqtl-sumstats --set k8Namespace=eqtl,image.repository=$CI_REGISTRY_IMAGE,image.tag=$CI_COMMIT_SHA,image.uid=$UID,image.gid=$GID,staticService.image.repository=$CI_REGISTRY_DOCS_IMAGE,staticService.image.tag=$CI_COMMIT_SHA,volume.data.nfsServer=$NFS_SERVER_PROD,volume.data.path=$NFS_PATH_PROD ./eqtlss/ --wait
environment:
name: production
6 changes: 3 additions & 3 deletions Dockerfile
@@ -1,8 +1,9 @@
FROM hdfgroup/h5py:2.7.0
FROM hdfgroup/hdf5lib:1.10.6


COPY requirements.txt /application/
WORKDIR /application
RUN pip install --upgrade pip
RUN pip install -r requirements.txt --ignore-installed six

COPY setup.py .
@@ -19,11 +20,10 @@ ENV EQSS_CONFIG "/application/config/properties.json"
ENV GACC_LOGS "logs/gaccess.log"
ENV GERR_LOGS "logs/gerror.log"
ENV GUNI_LOGS "logs/glogger.log"

ENV USER docker

ENV UID 1000
ENV GID 1000
ENV HDF5_ROOT_DIR "/files/output/"

RUN addgroup --gid "$GID" "$USER" \
&& adduser \
170 changes: 42 additions & 128 deletions README.md
@@ -4,157 +4,71 @@

EQTL Summary statistics with HDF5

The concept is to leverage the fast query times and multidimensional indexing capabilities of HDF5 to enable fast, useful querying of very large summary statistics datasets. There are loading scripts to convert TSV summary statistics to HDF5 using [PyTables](https://www.pytables.org/). Command line utilities enable the querying of HDF5, and a Python Flask app serves the data via a REST API. This is containerised and deployed to the cloud using Kubernetes.
The concept is to leverage the fast query times and multidimensional indexing capabilities of HDF5 to enable fast, useful querying of very large summary statistics datasets. There are loading scripts to convert TSV summary statistics to HDF5 using [PyTables](https://www.pytables.org/). A Python FastAPI app serves the data via a REST API. This is containerised and deployed to the cloud using Kubernetes.

This REST API facilitates filtered requests for eQTL association data.

# Local installation - using conda and pip
## API v2

- Clone the repository
- `git clone https://github.com/EBISPOT/SumStats.git`
- `cd SumStats`
- Create conda environment (installs HDF5 lib and other dependencies)
- `conda env create -f sumstats.yml`
- `conda activate sumstats`
- pip install the sumstats package - this will install pytables, numpy, flask, gunicorn - and sumstats
- `pip install -r requirements.txt`
- `pip install .`
Each study in the catalogue is split by QTL context and these splits are
assigned their own dataset IDs (QTD#). Datasets can be browsed at the `/datasets`
endpoint.

To retrieve the summary statistics for a dataset, use the
`/datasets/<DATASETID>/associations` endpoint and apply
any required filters.
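
For example, assuming a local instance on port 8000 with base path `/eqtl/api` (matching the Swagger URL below) and a hypothetical dataset ID `QTD000021`, queries might look like this sketch:

```
# List the available datasets (QTD# identifiers).
curl "http://127.0.0.1:8000/eqtl/api/datasets"

# Fetch associations for one dataset. The dataset ID and the size
# parameter are illustrative assumptions -- check the Swagger docs
# for the real filter names.
curl "http://127.0.0.1:8000/eqtl/api/datasets/QTD000021/associations?size=10"
```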

# Setting properties
Under the `config` directory you will find the files that are responsible for setting the runtime properties.
## API v1

`properties.py` is the default one. It can be altered, but you will need to re-install the package in order for the changes to take effect.
This will be deprecated and is maintained only for existing integrations.

`properties.json` can be edited and passed via an environment variable, `export EQSS_CONFIG=<path to json config>`, when running any of the following (see the example after this list):
- `gunicorn -b <host:port> --chdir sumstats/server --access-logfile <path to access log file> --error-logfile <path to error log file> app:app [--log-level <log level>]` to run the API
- `eqtl-search` to search the database via command line
- `eqtl-explore` to explore what is saved in the database via command line
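
A minimal sketch of using a custom config with the toolkit (the config path is illustrative):

```
# Point the toolkit at a custom config, then run one of the commands above.
export EQSS_CONFIG=/path/to/properties.json
eqtl-explore -studies   # list all the studies in the database
```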

# Installation

# Loading
Requires the HDF5 library. The easiest way to manage this is to use the Docker image. There is a public image, `ebispot/eqtl-sumstats-api`; for development, you can clone this repo and run `docker build -t ebispot/eqtl-sumstats-api:local .`

Data loading means converting TSV data to HDF5. It involves merging in data from five sources: i) summary statistics, ii) variant data, iii) expression data, iv) phenotype metadata, v) rsids from dbSNP. These data are split and joined to produce CSV and HDF5 files, where each file represents a study × QTL group × tissue × chromosome.
Once the package is installed, you can load and search for studies using the command-line toolkit.
# Running
## The web app
`docker run -v <path to API_v2>:/files/output -p 8000:8000 -e HDF5_ROOT_DIR=/files/output ebispot/eqtl-sumstats-api:local uvicorn sumstats.main:app --host 0.0.0.0`
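
Once the container is running, a quick sanity check from the host might look like this, assuming the Swagger docs route shown below:

```
# The Swagger UI should respond once the app is up.
curl -I "http://127.0.0.1:8000/eqtl/api/docs"
```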

```
$ eqtl-load --help
usage: eqtl-load [-h] [-f F] [-csv CSV] [-var VAR] [-phen PHEN] [-expr EXPR]
[-study STUDY] [-qtl_group QTL_GROUP] [-quant QUANT]
[-tissue TISSUE] [-chr CHR] -loader {study,trait,study_info}
[-tissue_ont TISSUE_ONT] [-treatment TREATMENT]
[-treatment_ont TREATMENT_ONT]
optional arguments:
-h, --help show this help message and exit
-f F The path to the summary statistics file to be
processed
-csv CSV The path to the csv OUT file
-var VAR The path to the variant/genotype metadata file
-phen PHEN The path to the trait/phenotype metadata file
-expr EXPR The path to the gene expression file
-study STUDY The study identifier
-qtl_group QTL_GROUP The qtl group e.g. "LCL"
-quant QUANT The quantification method e.g. "gene counts"
-tissue TISSUE The tissue
-chr CHR The chromosome the data belongs to
-loader {study,trait,study_info}
The loader
-tissue_ont TISSUE_ONT
The tissue ontology term
-treatment TREATMENT The treatment
-treatment_ont TREATMENT_ONT
The treatment ontology term
```

After conversion to HDF5, the file should be indexed (the fields to index can be modified in the [common_constants.py](sumstats/common_constants.py) file):
The Swagger docs are available at: http://127.0.0.1:8000/eqtl/api/docs

```
$ eqtl-reindex --help
usage: eqtl-reindex [-h] -f F
optional arguments:
-h, --help show this help message and exit
-f F The path to the HDF5 file to be processed
```

It is recommended to 'repack' the HDF5 to save disk space, using the PyTables [`ptrepack`](https://www.pytables.org/usersguide/utilities.html#ptrepack) command-line utility.
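A typical invocation might look like the sketch below; the compression settings are illustrative choices, not requirements of this repo:

```
# Repack with compression and rebuilt indexes; writes a new file,
# leaving the input untouched.
ptrepack --complevel 9 --complib blosc --propindexes \
    dataset1.h5:/ dataset1.repacked.h5:/
```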
## Data loading

Data loading means converting tsv data to HDF5.

# Exploring
To explore the contents of the database you can use the following commands:
### Convert sumstats TSVs to HDF5

Requires a Nextflow and Docker installation (or Singularity: run with `-with-singularity` instead of `-with-docker`).
The input TSVs must all be in a single directory (`--tsv_dir`).
The HDF5 output will be written to `--hdf5_dir`.
```
$ eqtl-explore --help
usage: eqtl-explore [-h] [-molecular_phenotypes]
[-molecular_phenotype MOLECULAR_PHENOTYPE] [-studies]
[-study STUDY] [-tissues] [-tissue TISSUE] [-chromosomes]
[-genes]
optional arguments:
-h, --help show this help message and exit
-molecular_phenotypes
List all the molecular_phenotypes
-molecular_phenotype MOLECULAR_PHENOTYPE
List all the studies for a molecular_phenotype
-studies List all the studies
-study STUDY Will list 'trait: study' if it exists
-tissues List all the tissues
-tissue TISSUE Will list 'study: tissue' if it exists
-chromosomes Will list all the chromosomes
-genes List all the genes
nextflow run sumstats/api_v2/workflows/tsv2hdf.nf --tsv_dir ./tsv/ --hdf5_dir ./hdf5/ -with-docker docker://ebispot/eqtl-sumstats-api:local
```
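
If Docker is not available, the same run with Singularity (per the note above) would be:

```
nextflow run sumstats/api_v2/workflows/tsv2hdf.nf --tsv_dir ./tsv/ --hdf5_dir ./hdf5/ -with-singularity docker://ebispot/eqtl-sumstats-api:local
```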


# Searching
To retrieve data from the database you can use the following commands:
The Nextflow pipeline runs a CLI:
```
$ eqtl-search --help
usage: eqtl-search [-h] [-path PATH] [-all] [-start START] [-size SIZE]
[-trait TRAIT] [-gene GENE] [-study STUDY] [-tissue TISSUE]
[-qtl_group QTL_GROUP] [-snp SNP] [-chr CHR] [-pval PVAL]
[-bp BP] [-quant_method {ge,tx,txrev,microarray,exon}]
[-paginate [PAGINATE]]
tsv2hdf --help
usage: tsv2hdf [-h] -t T -hdf HDF -type {data,metadata}
optional arguments:
-h, --help show this help message and exit
-path PATH Full path to the dir where the h5files will be stored
-all Use argument if you want to search for all
associations
-start START Index of the first association retrieved
-size SIZE Number of retrieved associations
-trait TRAIT The trait I am looking for
-gene GENE The gene I am looking for
-study STUDY The study I am looking for
-tissue TISSUE The tissue I am looking for
-qtl_group QTL_GROUP The QTL group/context I am looking for
-snp SNP The SNP I am looking for
-chr CHR The chromosome I am looking for
-pval PVAL Filter by pval threshold: -pval floor:ceil
-bp BP Filter with base pair location threshold: -bp
floor:ceil
-quant_method {ge,tx,txrev,microarray,exon}
The quantification method
-paginate [PAGINATE] Sets paginate to "False" if you would like to fetch
all associations for your query
-t T tsv path
-hdf HDF hdf5 file label e.g. dataset1
-type {data,metadata}
specify whether it is data or metadata
```
The Nextflow pipeline runs the above CLI for _each_ TSV (`$tsv_file`), where `$id` is the dataset ID:
```
tsv2hdf -t $tsv_file -hdf $id -type data;
```
The pipeline does not process the metadata. To load it, run the following, where `$tsv_file` is the metadata TSV, e.g. [this one](https://github.com/eQTL-Catalogue/eQTL-Catalogue-resources/blob/master/data_tables/dataset_metadata.tsv):
```
tsv2hdf -t $tsv_file -hdf qtl_metadata -type metadata
```
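
Since the CLI ships in the Docker image, one way to run the metadata load outside Nextflow is via the container; the mount path and file name below are illustrative:

```
# Hedged sketch: mount the working dir and run the metadata loader in the image.
docker run -v $PWD:/files ebispot/eqtl-sumstats-api:local \
    tsv2hdf -t /files/dataset_metadata.tsv -hdf qtl_metadata -type metadata
```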
The data will by default be retrieved in batches of 20 SNPs and their info per query. You can page through the data by keeping the default size of 20 and updating the start flag as you go (`-start <start>`), or use the flags `-start <start> -size <size>` to specify the size of each retrieval. The default value of `quant_method` is 'ge'.

There are two more flags that you can use (combined in the sketch below):
1. `-bp floor:ceil` e.g. `-bp 10000:20000000`, which specifies the range of base pair locations on the chromosome that you want; most useful when querying by chromosome, or by trait/study
2. `-pval floor:ceil` e.g. `-pval 2e-10:5e-5` or `-pval 0.000003:4e-3`, which specifies the p-value range of the results.
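
For example, a single query combining pagination with both range filters might look like this sketch (the study name is hypothetical):
```
eqtl-search -study STUDY1 -chr 1 -bp 10000:20000000 -pval 2e-10:5e-5 -start 0 -size 20
```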

Note that the `output` directory is set by default to `./files/output` in the properties file. If you need to change where it resides, modify the properties.json file and set `export EQSS_CONFIG=<path to json config>` accordingly.

The API uses the same search module and is more thoroughly documented [here](https://www.ebi.ac.uk/eqtl/api-docs/), which explains the fields and constraints.

# Exposing the API
To expose the API you need to run: `gunicorn -b <host:port> --chdir sumstats/server --access-logfile <path to access log file> --error-logfile <path to error log file> app:app [--log-level <log level>]`

You can set the environment variable `export EQSS_CONFIG=<path to json config>` to change the default properties, such as the directory where all the data is stored (the output directory), as explained in the sections above.

This will spin up the service and make it available on port 8080 (if running via docker, we exposed the port when we spun up the container).

You should be able to see the API on http://localhost:8080/
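
Filling in the placeholders, a concrete invocation might look like the following; the log paths are illustrative:

```
gunicorn -b 0.0.0.0:8080 --chdir sumstats/server \
    --access-logfile logs/gaccess.log --error-logfile logs/gerror.log app:app
```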
## Project structure
There are two versions of the API, v1 and v2. The code is separated under [sumstats](sumstats) so that when the v1 API is ready to be removed, [sumstats/api_v1](sumstats/api_v1/) can simply be deleted. [sumstats/main.py](sumstats/main.py), where the FastAPI app is created, will also need to be updated if v1 is removed.

# Deployment
Kubernetes deployment has been configured with GitLab using [.gitlab-ci.yml](.gitlab-ci.yml) and [Helm](eqtlss).
4 changes: 2 additions & 2 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
@@ -69,7 +69,7 @@ API quick reference

With the exception of genomic region requests, all requests for associations can all be made using the ``/associations`` endpoint, adding and combining parameters as needed for filtering.

.. qrefflask:: sumstats.server.app:app
.. qrefflask:: sumstats.api_v1.server.app:app
:undoc-static:
:order: path

@@ -363,6 +363,6 @@ A ``GET`` request is used to access the API.
API detailed reference
======================

.. autoflask:: sumstats.server.app:app
.. autoflask:: sumstats.api_v1.server.app:app
:undoc-static:
:order: path
4 changes: 3 additions & 1 deletion eqtlss/templates/deployment.yaml
Original file line number Diff line number Diff line change
@@ -17,6 +17,8 @@ spec:
value: {{.Values.image.gid}}
- name: USER
value: {{.Values.image.user}}
- name: HDF5_ROOT_DIR
value: {{.Values.image.dataMountPath}}
template:
metadata:
labels:
@@ -25,7 +27,7 @@ spec:
containers:
- name: {{.Values.service.name}}
image: "{{.Values.image.repository}}:{{.Values.image.tag}}"
command: {{.Values.service.gunicornCmd}}
command: {{.Values.service.uvicornCmd}}
imagePullPolicy: {{.Values.image.pullPolicy}}
resources:
requests:
1 change: 1 addition & 0 deletions eqtlss/values.yaml
Original file line number Diff line number Diff line change
@@ -46,6 +46,7 @@ service:
port: 8000
path: /eqtl/api(/|$)(.*)
gunicornCmd: '["gunicorn", "-b", "0.0.0.0:8000", "sumstats.server.app:app","--log-level=debug","--access-logfile=logs/ss_access.log","--error-logfile=logs/ss_error.log", "--workers=4"]'
uvicornCmd: '["uvicorn", "sumstats.main:app", "--host", "0.0.0.0", "--port", "8000", "--log-level=debug"]'

staticService:
replicaCount: 1
Binary file removed nginx/build/doctrees/environment.pickle
Binary file removed nginx/build/doctrees/index.doctree
4 changes: 0 additions & 4 deletions nginx/build/html/.buildinfo

This file was deleted.

Empty file removed nginx/build/html/.nojekyll
