From 1b4f5bcfb1e9816ae5294b915a4d94d43f5417a0 Mon Sep 17 00:00:00 2001
From: jdhayhurst
Date: Wed, 22 Apr 2020 17:32:53 +0100
Subject: [PATCH 1/9] update loader to take SE and not perform merge

---
 sumstats/common_constants.py |  4 +-
 sumstats/load.py             | 75 +++++++++++++++++-------------------
 2 files changed, 37 insertions(+), 42 deletions(-)

diff --git a/sumstats/common_constants.py b/sumstats/common_constants.py
index dec4b69..0bf6ba1 100644
--- a/sumstats/common_constants.py
+++ b/sumstats/common_constants.py
@@ -53,12 +53,12 @@ EFFECT_DSET, OTHER_DSET}
 
-TO_LOAD_DSET_HEADERS_DEFAULT = {PHEN_DSET, SNP_DSET, PVAL_DSET, CHR_DSET, BP_DSET, EFFECT_DSET, OTHER_DSET, BETA_DSET, MUTATION_DSET, AC_DSET, AN_DSET, FREQ_DSET, R2_DSET, EXPR_DSET, GENE_DSET, MTO_DSET, RSID_DSET}
+TO_LOAD_DSET_HEADERS_DEFAULT = {PHEN_DSET, SNP_DSET, PVAL_DSET, CHR_DSET, BP_DSET, EFFECT_DSET, OTHER_DSET, BETA_DSET, MUTATION_DSET, AC_DSET, AN_DSET, FREQ_DSET, R2_DSET, EXPR_DSET, GENE_DSET, MTO_DSET, RSID_DSET, SE_DSET}
 
 #TO_STORE_DSETS_DEFAULT = {SNP_DSET, MANTISSA_DSET, EXP_DSET, STUDY_DSET, CHR_DSET, BP_DSET, EFFECT_DSET, OTHER_DSET, BETA_DSET, RSID_DSET, MUTATION_DSET, AC_DSET, AN_DSET, FREQ_DSET, R2_DSET, EXPR_DSET}
 #TO_QUERY_DSETS_DEFAULT = {SNP_DSET, MANTISSA_DSET, EXP_DSET, STUDY_DSET, CHR_DSET, BP_DSET, BETA_DSET, RSID_DSET, MUTATION_DSET, AC_DSET, AN_DSET, FREQ_DSET, R2_DSET, MEAN_EXPR_DSET,
 #                          EFFECT_DSET, OTHER_DSET}
 
 # temp change tp pvalue instead of mantissa exp.
-TO_STORE_DSETS_DEFAULT = {SNP_DSET, PVAL_DSET, STUDY_DSET, CHR_DSET, BP_DSET, EFFECT_DSET, OTHER_DSET, BETA_DSET, RSID_DSET, MUTATION_DSET, AC_DSET, AN_DSET, FREQ_DSET, R2_DSET, EXPR_DSET}
+TO_STORE_DSETS_DEFAULT = {SNP_DSET, PVAL_DSET, STUDY_DSET, CHR_DSET, BP_DSET, EFFECT_DSET, OTHER_DSET, BETA_DSET, RSID_DSET, MUTATION_DSET, AC_DSET, AN_DSET, FREQ_DSET, SE_DSET, R2_DSET, EXPR_DSET}
 
 TO_QUERY_DSETS_DEFAULT = {SNP_DSET, PVAL_DSET, STUDY_DSET, CHR_DSET, BP_DSET, BETA_DSET, RSID_DSET, MUTATION_DSET, AC_DSET, AN_DSET, FREQ_DSET, R2_DSET, EXPR_DSET, EFFECT_DSET, OTHER_DSET, TISSUE_DSET}
 
 TO_INDEX = [PHEN_DSET, BP_DSET, PVAL_DSET, SNP_DSET, GENE_DSET]

diff --git a/sumstats/load.py b/sumstats/load.py
index b3adb1e..1a4486e 100644
--- a/sumstats/load.py
+++ b/sumstats/load.py
@@ -82,37 +82,31 @@ def write_csv_to_hdf(self, hdf, group):
         """Read in the sumstats files in chunks"""
         dfss = pd.read_csv(self.tsv, sep="\t",
-                           names=['molecular_trait_id', 'pchr', 'a', 'b',
-                                  'strand', 'c', 'd', 'variant_ss', 'chromosome_ss',
-                                  'position_ss', 'e', 'pvalue', 'beta', 'top'],
-                           dtype={'chromosome_ss': str, 'position_ss': int, 'variant_ss': str},
-                           header=None,
-                           usecols=['molecular_trait_id','variant_ss', 'chromosome_ss',
-                                    'position_ss','pvalue', 'beta'],
+                           dtype={'chromosome': str, 'position': int, 'variant': str},
                            float_precision='high',
                            chunksize=1000000)
 
-        """Read in the variant file"""
-        dfvar = pd.read_csv(self.var_file, sep="\t",
-                            names=['chromosome', 'position', 'variant', 'ref', 'alt',
-                                   'type', 'ac', 'an', 'maf', 'r2', 'rsid'],
-                            float_precision='high', skiprows=1,
-                            dtype={'chromosome': str, 'position': int, 'variant': str})
-
-        """Read in the trait file"""
-        # set the column order
-        dftrait = pd.read_csv(self.trait_file, sep="\t", usecols=['phenotype_id', 'gene_id', 'group_id'])[['phenotype_id', 'gene_id', 'group_id']]
-        dftrait.columns = ['phenotype_id', 'gene_id', 'molecular_trait_object_id']
-
-        if self.expr_file:
-            """Read in the gene expression file"""
-            dfexpr = pd.read_csv(self.expr_file, sep="\t", float_precision='high', names=['phenotype_id',
-                                 'study', 'qtl_group', 'median_tpm'])
-            dfexpr = dfexpr[dfexpr.study == self.study]
-            dfexpr = dfexpr[dfexpr.qtl_group == self.qtl_group]
-            dfexpr["median_tpm"] = pd.to_numeric(dfexpr["median_tpm"], errors='coerce')
-        else:
-            print("no expression file")
-            dfexpr = pd.DataFrame(columns=['phenotype_id', 'study', 'qtl_group', 'median_tpm'])
+        #"""Read in the variant file"""
+        #dfvar = pd.read_csv(self.var_file, sep="\t",
+        #                    names=['chromosome', 'position', 'variant', 'ref', 'alt',
+        #                           'type', 'ac', 'an', 'maf', 'r2', 'rsid'],
+        #                    float_precision='high', skiprows=1,
+        #                    dtype={'chromosome': str, 'position': int, 'variant': str})
+        #
+        #"""Read in the trait file"""
+        ## set the column order
+        #dftrait = pd.read_csv(self.trait_file, sep="\t", usecols=['phenotype_id', 'gene_id', 'group_id'])[['phenotype_id', 'gene_id', 'group_id']]
+        #dftrait.columns = ['phenotype_id', 'gene_id', 'molecular_trait_object_id']
+        #
+        #if self.expr_file:
+        #    """Read in the gene expression file"""
+        #    dfexpr = pd.read_csv(self.expr_file, sep="\t", float_precision='high', names=['phenotype_id', 'study', 'qtl_group', 'median_tpm'])
+        #    dfexpr = dfexpr[dfexpr.study == self.study]
+        #    dfexpr = dfexpr[dfexpr.qtl_group == self.qtl_group]
+        #    dfexpr["median_tpm"] = pd.to_numeric(dfexpr["median_tpm"], errors='coerce')
+        #else:
+        #    print("no expression file")
+        #    dfexpr = pd.DataFrame(columns=['phenotype_id', 'study', 'qtl_group', 'median_tpm'])
 
         with pd.HDFStore(hdf) as store:
             """store in hdf5 as below"""
@@ -120,17 +114,18 @@ def write_csv_to_hdf(self, hdf, group):
 
             for chunk in dfss:
                 print(count)
-                merged = pd.merge(chunk, dfvar, how='left', left_on=['variant_ss'], right_on=['variant'])
-                print("merged one ")
-                merged2 = pd.merge(merged, dftrait, how='left', left_on=['molecular_trait_id'], right_on=['phenotype_id'])
-                print("merged two")
-                merged3 = pd.merge(merged2, dfexpr, how='left', left_on=['molecular_trait_id'], right_on=['phenotype_id'])
-                print("merged three")
-                merged3 = merged3[list(TO_LOAD_DSET_HEADERS_DEFAULT)]
+                #merged = pd.merge(chunk, dfvar, how='left', left_on=['variant_ss'], right_on=['variant'])
+                #print("merged one ")
+                #merged2 = pd.merge(merged, dftrait, how='left', left_on=['molecular_trait_id'], right_on=['phenotype_id'])
+                #print("merged two")
+                #merged3 = pd.merge(merged2, dfexpr, how='left', left_on=['molecular_trait_id'], right_on=['phenotype_id'])
+                #print("merged three")
+                #merged3 = merged3[list(TO_LOAD_DSET_HEADERS_DEFAULT)]
+                chunk = chunk[sorted(TO_LOAD_DSET_HEADERS_DEFAULT)]
                 for field in [EFFECT_DSET, OTHER_DSET]:
-                    self.placeholder_if_allele_string_too_long(df=merged3, field=field)
-                self.placeholder_if_variant_id_too_long(df=merged3, field=SNP_DSET)
-                merged3.to_hdf(store, group,
+                    self.placeholder_if_allele_string_too_long(df=chunk, field=field)
+                self.placeholder_if_variant_id_too_long(df=chunk, field=SNP_DSET)
+                chunk.to_hdf(store, group,
                              complib='blosc',
                              complevel=9,
                              format='table',
@@ -155,10 +150,10 @@ def write_csv_to_hdf(self, hdf, group):
                                                             'quant_method': self.quant_method,
                                                             'trait_file': os.path.basename(self.trait_file)}
             if count == 1:
-                merged3.to_csv(self.csv_out, compression='gzip', columns=list(TO_LOAD_DSET_HEADERS_DEFAULT),
+                chunk.to_csv(self.csv_out, compression='gzip', columns=sorted(TO_LOAD_DSET_HEADERS_DEFAULT),
                              index=False, mode='w', sep='\t', encoding='utf-8', na_rep="NA")
             else:
-                merged3.to_csv(self.csv_out, compression='gzip', columns=list(TO_LOAD_DSET_HEADERS_DEFAULT),
+                chunk.to_csv(self.csv_out, compression='gzip', columns=sorted(TO_LOAD_DSET_HEADERS_DEFAULT),
                              header=False, index=False, mode='a', sep='\t', encoding='utf-8', na_rep="NA")
             count += 1
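
What patch 1 changes, in effect: the loader now assumes the input TSV already carries every dataset the store needs (including SE), so the per-chunk merges against the variant, trait and expression files are dropped and each chunk is written straight to the HDF5 table. A minimal sketch of that chunked read-and-append pattern, assuming pandas with PyTables installed; the file names and the column list below are illustrative stand-ins, not the real TO_LOAD_DSET_HEADERS_DEFAULT:

    import pandas as pd

    # Illustrative stand-in for TO_LOAD_DSET_HEADERS_DEFAULT.
    TO_LOAD = ['molecular_trait_id', 'variant', 'chromosome', 'position',
               'pvalue', 'beta', 'se']

    chunks = pd.read_csv('sumstats.tsv', sep='\t', chunksize=1_000_000,
                         dtype={'chromosome': str, 'position': int, 'variant': str},
                         float_precision='high')

    with pd.HDFStore('sumstats.h5') as store:
        for chunk in chunks:
            chunk = chunk[sorted(TO_LOAD)]      # fixed column order for every chunk
            store.append('study_group', chunk,  # 'table' format, so chunks accumulate
                         format='table', complib='blosc', complevel=9,
                         data_columns=['position', 'pvalue'])
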
From 8478e842e45a1a0183d866f586fa6bfc053ee545 Mon Sep 17 00:00:00 2001
From: jdhayhurst
Date: Thu, 23 Apr 2020 16:59:39 +0100
Subject: [PATCH 2/9] chromosome --> str

---
 sumstats/common_constants.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sumstats/common_constants.py b/sumstats/common_constants.py
index 0bf6ba1..962dcf3 100644
--- a/sumstats/common_constants.py
+++ b/sumstats/common_constants.py
@@ -38,7 +38,7 @@
 DSET_TYPES = {SNP_DSET: str, RSID_DSET: str, MUTATION_DSET: str, AC_DSET: int, AN_DSET: int, PVAL_DSET: str, MANTISSA_DSET: float, EXP_DSET: int, STUDY_DSET: str,
-              CHR_DSET: int, BP_DSET: int, R2_DSET: float, BETA_DSET: float, SE_DSET: float,
+              CHR_DSET: str, BP_DSET: int, R2_DSET: float, BETA_DSET: float, SE_DSET: float,
               EFFECT_DSET: str, OTHER_DSET: str, FREQ_DSET: float, EXPR_DSET: float,
               TISSUE_DSET: str, QTL_GROUP_DSET: str, CONDITION_DSET: str, CONDITION_LABEL_DSET: str, TISSUE_LABEL_DSET: str}

From 6821b51d10b400c3af6053b15f60d1a08b9444e8 Mon Sep 17 00:00:00 2001
From: jdhayhurst
Date: Thu, 23 Apr 2020 18:13:17 +0100
Subject: [PATCH 3/9] hdf in append mode

---
 sumstats/load.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sumstats/load.py b/sumstats/load.py
index 1a4486e..97d68e7 100644
--- a/sumstats/load.py
+++ b/sumstats/load.py
@@ -129,7 +129,7 @@ def write_csv_to_hdf(self, hdf, group):
                              complib='blosc',
                              complevel=9,
                              format='table',
-                             mode='w',
+                             mode='a',
                              append=True,
                              data_columns=list(TO_INDEX),
                              #expectedrows=num_rows,
@@ -147,8 +147,8 @@ def write_csv_to_hdf(self, hdf, group):
             """Store study specific metadata"""
             store.get_storer(group).attrs.study_metadata = {'study': self.study,
                                                             'qtl_group': self.qtl_group,
-                                                            'quant_method': self.quant_method,
-                                                            'trait_file': os.path.basename(self.trait_file)}
+                                                            'quant_method': self.quant_method}
+
             if count == 1:
                 chunk.to_csv(self.csv_out, compression='gzip', columns=sorted(TO_LOAD_DSET_HEADERS_DEFAULT),
                              index=False, mode='w', sep='\t', encoding='utf-8', na_rep="NA")

From 0fcb4528637544b97b7f32d7f1d55bfced4e0a29 Mon Sep 17 00:00:00 2001
From: jdhayhurst
Date: Thu, 23 Apr 2020 18:17:42 +0100
Subject: [PATCH 4/9] hdf in append mode

---
 sumstats/load.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sumstats/load.py b/sumstats/load.py
index 97d68e7..0d06035 100644
--- a/sumstats/load.py
+++ b/sumstats/load.py
@@ -129,7 +129,7 @@ def write_csv_to_hdf(self, hdf, group):
                              complib='blosc',
                              complevel=9,
                              format='table',
-                             mode='a',
+                             mode='w',
                              append=True,
                              data_columns=list(TO_INDEX),
                              #expectedrows=num_rows,

From 25225755c2a1f3c94b25236caf1515c32086268a Mon Sep 17 00:00:00 2001
From: jdhayhurst
Date: Thu, 23 Apr 2020 18:29:06 +0100
Subject: [PATCH 5/9] hdf in append mode

---
 sumstats/load.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sumstats/load.py b/sumstats/load.py
index 0d06035..97d68e7 100644
--- a/sumstats/load.py
+++ b/sumstats/load.py
@@ -129,7 +129,7 @@ def write_csv_to_hdf(self, hdf, group):
                              complib='blosc',
                              complevel=9,
                              format='table',
-                             mode='w',
+                             mode='a',
                              append=True,
                              data_columns=list(TO_INDEX),
                              #expectedrows=num_rows,
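
Patches 3-5 flip the to_hdf mode between 'w' and 'a' before settling on append mode. The behaviour they converge on is that successive chunks must accumulate in one 'table'-format dataset rather than replace it. A small sketch of that behaviour, assuming PyTables is available (the file and group names here are arbitrary):

    import pandas as pd

    df = pd.DataFrame({'x': range(3)})

    # Open the store once for appending and keep adding table chunks to the
    # same key: the rows accumulate instead of being overwritten.
    with pd.HDFStore('demo.h5', mode='a') as store:
        store.append('grp', df, format='table')
        store.append('grp', df, format='table')
        print(store.get_storer('grp').nrows)    # 6 -> both chunks were kept
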
From 22f81bdc44548207adf65cc324e2e85f574b6cc1 Mon Sep 17 00:00:00 2001
From: jdhayhurst
Date: Tue, 28 Apr 2020 12:23:11 +0100
Subject: [PATCH 6/9] pvalue dtype

---
 sumstats/common_constants.py |  2 +-
 sumstats/load.py             | 15 ++++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/sumstats/common_constants.py b/sumstats/common_constants.py
index 962dcf3..0f6d9ca 100644
--- a/sumstats/common_constants.py
+++ b/sumstats/common_constants.py
@@ -37,7 +37,7 @@
 #qtl_group, condition, condition_label, cell_type, ontology_term, ontology_label
 
-DSET_TYPES = {SNP_DSET: str, RSID_DSET: str, MUTATION_DSET: str, AC_DSET: int, AN_DSET: int, PVAL_DSET: str, MANTISSA_DSET: float, EXP_DSET: int, STUDY_DSET: str,
+DSET_TYPES = {SNP_DSET: str, RSID_DSET: str, MUTATION_DSET: str, AC_DSET: int, AN_DSET: int, PVAL_DSET: float, MANTISSA_DSET: float, EXP_DSET: int, STUDY_DSET: str,
               CHR_DSET: str, BP_DSET: int, R2_DSET: float, BETA_DSET: float, SE_DSET: float,
               EFFECT_DSET: str, OTHER_DSET: str, FREQ_DSET: float, EXPR_DSET: float,
               TISSUE_DSET: str, QTL_GROUP_DSET: str, CONDITION_DSET: str, CONDITION_LABEL_DSET: str, TISSUE_LABEL_DSET: str}

diff --git a/sumstats/load.py b/sumstats/load.py
index 97d68e7..21c7063 100644
--- a/sumstats/load.py
+++ b/sumstats/load.py
@@ -9,7 +9,7 @@
 from sumstats.utils.properties_handler import properties
 from sumstats.utils import properties_handler
 from sumstats.utils import filesystem_utils as fsutils
-import sumstats.utils.sqlite_client as sq 
+import sumstats.utils.sqlite_client as sq
 
 
 class Loader():
@@ -34,7 +34,7 @@ def __init__(self, tsv=None, csv_out=None, var_file=None, qtl_group=None, quant_
         self.tissue_ont = tissue_ont
         self.treatment = treatment
         self.treatment_ont = treatment_ont
-        
+
         self.filename = None
         if self.tsv:
             self.filename = os.path.splitext(os.path.basename(self.tsv))[0]
@@ -82,10 +82,11 @@ def write_csv_to_hdf(self, hdf, group):
         """Read in the sumstats files in chunks"""
         dfss = pd.read_csv(self.tsv, sep="\t",
-                           dtype={'chromosome': str, 'position': int, 'variant': str},
+                           dtype=DSET_TYPES,
+                           usecols=list(TO_LOAD_DSET_HEADERS_DEFAULT),
                            float_precision='high',
                            chunksize=1000000)
-        
+
         #"""Read in the variant file"""
         #dfvar = pd.read_csv(self.var_file, sep="\t",
         #                    names=['chromosome', 'position', 'variant', 'ref', 'alt',
@@ -148,7 +149,7 @@ def write_csv_to_hdf(self, hdf, group):
             store.get_storer(group).attrs.study_metadata = {'study': self.study,
                                                             'qtl_group': self.qtl_group,
                                                             'quant_method': self.quant_method}
-            
+
             if count == 1:
                 chunk.to_csv(self.csv_out, compression='gzip', columns=sorted(TO_LOAD_DSET_HEADERS_DEFAULT),
                              index=False, mode='w', sep='\t', encoding='utf-8', na_rep="NA")
@@ -178,7 +179,7 @@ def load_study_info(self):
                 trait_file blob not null,
                 tissue_ontology blob not null,
                 treatment blob,
-                treatment_ontology blob not null, 
+                treatment_ontology blob not null,
                 quant_method blob,
                 UNIQUE (identifier)
                 );
@@ -210,7 +211,7 @@ def main():
     argparser.add_argument('-treatment_ont', help='The treatment ontology term', required=False)
 
     args = argparser.parse_args()
-    
+
     properties_handler.set_properties()  # pragma: no cover
     h5files_path = properties.h5files_path  # pragma: no cover
     tsvfiles_path = properties.tsvfiles_path  # pragma: no cover
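
With patch 6 the column selection and typing move into read_csv itself (usecols plus dtype=DSET_TYPES), so every chunk arrives with only the expected columns, already typed, and non-numeric chromosome names such as X or MT survive because CHR_DSET is a string (patch 2). A hedged illustration with made-up column names standing in for the real constants:

    import io
    import pandas as pd

    # Illustrative stand-in for DSET_TYPES / TO_LOAD_DSET_HEADERS_DEFAULT.
    dtypes = {'chromosome': str, 'position': 'int64', 'pvalue': float, 'se': float}

    tsv = io.StringIO('chromosome\tposition\tpvalue\tse\textra\n'
                      'X\t101\t1e-8\t0.02\tdropped\n')
    df = pd.read_csv(tsv, sep='\t', usecols=list(dtypes), dtype=dtypes)
    print(df.dtypes)   # chromosome stays object (str), so 'X'/'Y'/'MT' are preserved
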
From 9e6677dcbe2a31f9483cf10967aa98c205c8fca8 Mon Sep 17 00:00:00 2001
From: jdhayhurst
Date: Tue, 28 Apr 2020 13:33:30 +0100
Subject: [PATCH 7/9] int to int64 dtypes

---
 sumstats/common_constants.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sumstats/common_constants.py b/sumstats/common_constants.py
index 0f6d9ca..58cc21a 100644
--- a/sumstats/common_constants.py
+++ b/sumstats/common_constants.py
@@ -37,8 +37,8 @@
 #qtl_group, condition, condition_label, cell_type, ontology_term, ontology_label
 
-DSET_TYPES = {SNP_DSET: str, RSID_DSET: str, MUTATION_DSET: str, AC_DSET: int, AN_DSET: int, PVAL_DSET: float, MANTISSA_DSET: float, EXP_DSET: int, STUDY_DSET: str,
-              CHR_DSET: str, BP_DSET: int, R2_DSET: float, BETA_DSET: float, SE_DSET: float,
+DSET_TYPES = {SNP_DSET: str, RSID_DSET: str, MUTATION_DSET: str, AC_DSET: float, AN_DSET: float, PVAL_DSET: str, MANTISSA_DSET: float, EXP_DSET: "int64", STUDY_DSET: str,
+              CHR_DSET: str, BP_DSET: "int64", R2_DSET: float, BETA_DSET: float, SE_DSET: float,
               EFFECT_DSET: str, OTHER_DSET: str, FREQ_DSET: float, EXPR_DSET: float,
               TISSUE_DSET: str, QTL_GROUP_DSET: str, CONDITION_DSET: str, CONDITION_LABEL_DSET: str, TISSUE_LABEL_DSET: str}

From 7ff73cc1d094aaa8b1819ec49661ce0d4ecad72b Mon Sep 17 00:00:00 2001
From: jdhayhurst
Date: Tue, 28 Apr 2020 13:57:45 +0100
Subject: [PATCH 8/9] pvalue dtype

---
 sumstats/common_constants.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sumstats/common_constants.py b/sumstats/common_constants.py
index 58cc21a..34153f7 100644
--- a/sumstats/common_constants.py
+++ b/sumstats/common_constants.py
@@ -37,7 +37,7 @@
 #qtl_group, condition, condition_label, cell_type, ontology_term, ontology_label
 
-DSET_TYPES = {SNP_DSET: str, RSID_DSET: str, MUTATION_DSET: str, AC_DSET: float, AN_DSET: float, PVAL_DSET: str, MANTISSA_DSET: float, EXP_DSET: "int64", STUDY_DSET: str,
+DSET_TYPES = {SNP_DSET: str, RSID_DSET: str, MUTATION_DSET: str, AC_DSET: float, AN_DSET: float, PVAL_DSET: float, MANTISSA_DSET: float, EXP_DSET: "int64", STUDY_DSET: str,
               CHR_DSET: str, BP_DSET: "int64", R2_DSET: float, BETA_DSET: float, SE_DSET: float,
               EFFECT_DSET: str, OTHER_DSET: str, FREQ_DSET: float, EXPR_DSET: float,
               TISSUE_DSET: str, QTL_GROUP_DSET: str, CONDITION_DSET: str, CONDITION_LABEL_DSET: str, TISSUE_LABEL_DSET: str}
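
Patches 7 and 8 settle the dtype map: BP and EXP become an explicit "int64", AC and AN drop to float, and the p-value ends up as float. The constraint behind the int-to-float moves is most likely that NumPy-backed integer columns cannot hold NaN, so any column that may be missing has to be read as float (or as a nullable integer in recent pandas). A two-line demonstration:

    import pandas as pd

    # A missing value silently promotes a plain int column to float64 ...
    print(pd.Series([10, None, 30]).dtype)                  # float64

    # ... whereas the nullable extension dtype keeps integer semantics.
    print(pd.array([10, None, 30], dtype='Int64').dtype)    # Int64
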
self.cur.execute("SELECT * FROM study_info where identifier =?", (identifier,)) - self.cur.execute(""" - SELECT s.study, s.identifier, q.qtl_group, q.cell_type, s.trait_file, q.ontology_term, q.condition, q.condition_label - FROM qtl_context_mapping AS q - JOIN study_info AS s - ON q.study = s.study AND q.qtl_group = s.qtl_group - WHERE s.identifier =? - """, (identifier,)) + #self.cur.execute(""" + # SELECT s.study, s.identifier, q.qtl_group, q.cell_type, s.trait_file, q.ontology_term, q.condition, q.condition_label + # FROM qtl_context_mapping AS q + # JOIN study_info AS s + # ON q.study = s.study AND q.qtl_group = s.qtl_group + # WHERE s.identifier =? + # """, (identifier,)) data = self.cur.fetchone() if data: - data_dict["study"], data_dict["identifier"], data_dict["qtl_group"], data_dict["tissue_label"], data_dict["phen"], data_dict["tissue_ont"], data_dict["condition"], data_dict["condition_label"] = data + data_dict["study"], data_dict["identifier"], data_dict["qtl_group"], data_dict["tissue_label"], data_dict["phen"], data_dict["tissue_ont"], data_dict["condition"], _ , data_dict["quant_method"], data_dict["condition_label"] = data return data_dict def get_traits(self): @@ -305,7 +305,8 @@ def commit(self): self.cur.execute("COMMIT") def drop_rsid_index(self): - self.cur.execute("DROP INDEX rsid_idx") + self.cur.execute("DROP INDEX IF EXISTS rsid_idx") + def create_rsid_index(self): self.cur.execute("CREATE INDEX rsid_idx on snp (rsid)") diff --git a/sumstats/utils/vcf_to_sqlite.py b/sumstats/utils/vcf_to_sqlite.py index 850e00c..a5033f9 100755 --- a/sumstats/utils/vcf_to_sqlite.py +++ b/sumstats/utils/vcf_to_sqlite.py @@ -6,31 +6,36 @@ def main(): argparser = argparse.ArgumentParser() - argparser.add_argument('-vcf', help='The name of the vcf to be processed', required=True) + argparser.add_argument('-vcf', help='The name of the vcf to be processed', required=False) argparser.add_argument('-db', help='The name of the database to load to', required=True) + argparser.add_argument('-index', help='create index on the rsid', required=False, action='store_true') args = argparser.parse_args() db = args.db - vcf = args.vcf + if args.vcf: + vcf = args.vcf - vcfdf = pd.read_csv(vcf, sep='\t', - comment='#', - header=None, - dtype=str, - usecols=[0, 1, 2], - names=['CHROM', 'POS', 'RSID'] - ) - - vcfdf.RSID = vcfdf.RSID.str.replace("rs","") - vcfdf.CHROM =vcfdf.CHROM.replace({'X': 23, 'Y': 24, 'MT': 25}) - - sql = sq.sqlClient(db) - sql.drop_rsid_index() - list_of_tuples = list(vcfdf.itertuples(index=False, name=None)) - sql.cur.execute('BEGIN TRANSACTION') - sql.cur.executemany("insert or ignore into snp(chr, position, rsid) values (?, ?, ?)", list_of_tuples) - sql.cur.execute('COMMIT') - sql.create_rsid_index() + vcfdf = pd.read_csv(vcf, sep='\t', + comment='#', + header=None, + dtype=str, + usecols=[0, 1, 2], + names=['CHROM', 'POS', 'RSID'] + ) + + vcfdf.RSID = vcfdf.RSID.str.replace("rs","") + sql = sq.sqlClient(db) + sql.drop_rsid_index() + list_of_tuples = list(vcfdf.itertuples(index=False, name=None)) + sql.cur.execute('BEGIN TRANSACTION') + sql.cur.executemany("insert or ignore into snp(chr, position, rsid) values (?, ?, ?)", list_of_tuples) + sql.cur.execute('COMMIT') + if args.index: + sql = sq.sqlClient(db) + sql.drop_rsid_index() + sql.create_rsid_index() + else: + print("nothing left to do") if __name__ == '__main__':