Skip to content

Commit

Permalink
Change chromosome from int to string; optimise SNP table loader
Browse files (browse the repository at this point in the history)
  • Loading branch information
jdhayhurst committed Apr 30, 2020
1 parent 7ff73cc commit de4b5a4
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 33 deletions.
1 change: 0 additions & 1 deletion sumstats/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,6 @@ def load_study_info(self):
"""
sql = sq.sqlClient(self.sqldb)
identifier = self.study + "+" + self.qtl_group + "+" + self.quant_method
print(self.trait_file)
trait_file_id = os.path.basename(self.trait_file)
data = [self.study, identifier, self.qtl_group, self.tissue, trait_file_id, self.tissue_ont, self.treatment, self.treatment_ont, self.quant_method ]
sql.cur.execute("insert or ignore into study_info values (?,?,?,?,?,?,?,?,?)", data)
Expand Down
2 changes: 0 additions & 2 deletions sumstats/utils/argument_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,6 @@ def convert_search_args(args):
paginate = args.paginate

chromosome = args.chr
if chromosome is not None:
chromosome = int(chromosome)

pval_interval = args.pval
pval_interval = FloatInterval().set_string_tuple(pval_interval)
Expand Down
21 changes: 11 additions & 10 deletions sumstats/utils/sqlite_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,19 +86,19 @@ def get_study_context_meta(self, identifier):
"condition_label": None
}

#self.cur.execute("SELECT * FROM study_info where identifier =?", (identifier,))
self.cur.execute("SELECT * FROM study_info where identifier =?", (identifier,))

self.cur.execute("""
SELECT s.study, s.identifier, q.qtl_group, q.cell_type, s.trait_file, q.ontology_term, q.condition, q.condition_label
FROM qtl_context_mapping AS q
JOIN study_info AS s
ON q.study = s.study AND q.qtl_group = s.qtl_group
WHERE s.identifier =?
""", (identifier,))
#self.cur.execute("""
# SELECT s.study, s.identifier, q.qtl_group, q.cell_type, s.trait_file, q.ontology_term, q.condition, q.condition_label
# FROM qtl_context_mapping AS q
# JOIN study_info AS s
# ON q.study = s.study AND q.qtl_group = s.qtl_group
# WHERE s.identifier =?
# """, (identifier,))

data = self.cur.fetchone()
if data:
data_dict["study"], data_dict["identifier"], data_dict["qtl_group"], data_dict["tissue_label"], data_dict["phen"], data_dict["tissue_ont"], data_dict["condition"], data_dict["condition_label"] = data
data_dict["study"], data_dict["identifier"], data_dict["qtl_group"], data_dict["tissue_label"], data_dict["phen"], data_dict["tissue_ont"], data_dict["condition"], _ , data_dict["quant_method"], data_dict["condition_label"] = data
return data_dict

def get_traits(self):
Expand Down Expand Up @@ -305,7 +305,8 @@ def commit(self):
self.cur.execute("COMMIT")

def drop_rsid_index(self):
self.cur.execute("DROP INDEX rsid_idx")
self.cur.execute("DROP INDEX IF EXISTS rsid_idx")


def create_rsid_index(self):
self.cur.execute("CREATE INDEX rsid_idx on snp (rsid)")
45 changes: 25 additions & 20 deletions sumstats/utils/vcf_to_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,31 +6,36 @@

def main():
argparser = argparse.ArgumentParser()
argparser.add_argument('-vcf', help='The name of the vcf to be processed', required=True)
argparser.add_argument('-vcf', help='The name of the vcf to be processed', required=False)
argparser.add_argument('-db', help='The name of the database to load to', required=True)
argparser.add_argument('-index', help='create index on the rsid', required=False, action='store_true')
args = argparser.parse_args()
db = args.db
vcf = args.vcf
if args.vcf:
vcf = args.vcf

vcfdf = pd.read_csv(vcf, sep='\t',
comment='#',
header=None,
dtype=str,
usecols=[0, 1, 2],
names=['CHROM', 'POS', 'RSID']
)

vcfdf.RSID = vcfdf.RSID.str.replace("rs","")
vcfdf.CHROM =vcfdf.CHROM.replace({'X': 23, 'Y': 24, 'MT': 25})

sql = sq.sqlClient(db)
sql.drop_rsid_index()
list_of_tuples = list(vcfdf.itertuples(index=False, name=None))
sql.cur.execute('BEGIN TRANSACTION')
sql.cur.executemany("insert or ignore into snp(chr, position, rsid) values (?, ?, ?)", list_of_tuples)
sql.cur.execute('COMMIT')
sql.create_rsid_index()
vcfdf = pd.read_csv(vcf, sep='\t',
comment='#',
header=None,
dtype=str,
usecols=[0, 1, 2],
names=['CHROM', 'POS', 'RSID']
)

vcfdf.RSID = vcfdf.RSID.str.replace("rs","")

sql = sq.sqlClient(db)
sql.drop_rsid_index()
list_of_tuples = list(vcfdf.itertuples(index=False, name=None))
sql.cur.execute('BEGIN TRANSACTION')
sql.cur.executemany("insert or ignore into snp(chr, position, rsid) values (?, ?, ?)", list_of_tuples)
sql.cur.execute('COMMIT')
if args.index:
sql = sq.sqlClient(db)
sql.drop_rsid_index()
sql.create_rsid_index()
else:
print("nothing left to do")


if __name__ == '__main__':
Expand Down

0 comments on commit de4b5a4

Please sign in to comment.