From 30a065bcb3a89c6c1512793da3d9ecd6b8fb14ca Mon Sep 17 00:00:00 2001 From: vivekruhela Date: Fri, 22 Apr 2022 09:37:07 +0000 Subject: [PATCH] unique identifier added --- Notebook/miRSim.ipynb | 156 ++++---- README.md | 9 +- check_sequence.py | 10 + generate_sequence.py | 32 +- miRSim.ipynb | 805 ++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 1 + 6 files changed, 917 insertions(+), 96 deletions(-) mode change 100644 => 100755 Notebook/miRSim.ipynb mode change 100644 => 100755 README.md create mode 100644 check_sequence.py mode change 100644 => 100755 generate_sequence.py create mode 100755 miRSim.ipynb diff --git a/Notebook/miRSim.ipynb b/Notebook/miRSim.ipynb old mode 100644 new mode 100755 index 863316e..78b3e2b --- a/Notebook/miRSim.ipynb +++ b/Notebook/miRSim.ipynb @@ -19,46 +19,14 @@ "from scipy.stats import gamma\n", "from scipy.stats import uniform\n", "from scipy.stats import expon\n", - "from scipy.stats import poisson" + "from scipy.stats import poisson\n", + "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c1a9746d75674500bca4c8099f5f1bb2", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/vivek/anaconda3/lib/python3.7/site-packages/tqdm/std.py:648: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n", - " from pandas import Panel\n" - ] - } - ], - "source": [ - "from tqdm import tqdm_notebook as tqdm\n", - "tqdm().pandas()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, "outputs": [], "source": [ "def alter_nt(seq_seed,position):\n", @@ -76,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -118,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -140,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -184,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -201,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -246,7 +214,23 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def check_sequence(rna_dict, seq):\n", + " all_sequences = [v for _,v in rna_dict.items()]\n", + " if seq.upper() in all_sequences:\n", + " unique_flag = False\n", + " return unique_flag\n", + " else:\n", + " unique_flag = True\n", + " return unique_flag " + ] + }, + { + "cell_type": "code", + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -284,7 +268,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -334,7 +318,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -351,17 +335,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], "source": [ "def generate_sequence(fasta_seq, gff_df, rna_dict, no_mir_chr, n_seq_per_chr, depth, seq_error, out, out_file,write_mode,repeat,distribution, no_mismatch_seed , no_mismatch_xseed,seed):\n", " \n", @@ -415,14 +391,23 @@ " mir_seq_new = rna_dict[mir]\n", " mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n", " elif seq_error == 'Seed_region':\n", - " mir_seq_new = sequence_alteration(rna_dict[mir],'seed',no_mismatch_seed , no_mismatch_xseed, seed)\n", - " mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n", + " unique_flag = False\n", + " while unique_flag == False:\n", + " mir_seq_new = sequence_alteration(rna_dict[mir],'seed',no_mismatch_seed , no_mismatch_xseed, seed)\n", + " mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n", + " unique_flag = check_sequence(rna_dict,mir_seq_new)\n", " elif seq_error == 'Outside_Seed_region':\n", - " mir_seq_new = sequence_alteration(rna_dict[mir],'xseed',no_mismatch_seed , no_mismatch_xseed, seed)\n", - " mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n", + " unique_flag= False\n", + " while unique_flag == False:\n", + " mir_seq_new = sequence_alteration(rna_dict[mir],'xseed',no_mismatch_seed , no_mismatch_xseed, seed)\n", + " mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n", + " unique_flag = check_sequence(rna_dict,mir_seq_new)\n", " elif seq_error == 'Both_region':\n", - " mir_seq_new = sequence_alteration(rna_dict[mir],'both',no_mismatch_seed , no_mismatch_xseed, seed)\n", - " mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n", + " unique_flag= False\n", + " while unique_flag == False:\n", + " mir_seq_new = sequence_alteration(rna_dict[mir],'both',no_mismatch_seed , no_mismatch_xseed, seed)\n", + " mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n", + " unique_flag = check_sequence(rna_dict,mir_seq_new)\n", "\n", " loc = mir_location(gff_df,mir,seed)\n", " mir_depth = int(exp)\n", @@ -430,7 +415,14 @@ " line += mir + '\\t' + seq_error + '\\t' + mir_cigar + '\\t' + rna_dict[mir] + '\\t' + mir_seq_new + '\\t' + loc[0] + '\\t' + str(loc[1]) + '\\t' + str(loc[2]) + '\\t' + str(mir_depth) + '\\n'\n", " rna_ground_truth.write(line) \n", " for dep in range(mir_depth):\n", - " fasta_seq.append('>' + mir)\n", + " if seq_error == 'None':\n", + " fasta_seq.append('>' + mir + '_no_alterations')\n", + " if seq_error == 'Seed_region':\n", + " fasta_seq.append('>' + mir + '_seed_alterations')\n", + " if seq_error == 'Seed_region':\n", + " fasta_seq.append('>' + mir + '_xseed_alterations')\n", + " if seq_error == 'Both_region':\n", + " fasta_seq.append('>' + mir + '_both_seed_&_xseed_alterations')\n", " fasta_seq.append(mir_seq_new)\n", "\n", " return fasta_seq" @@ -438,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -488,7 +480,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -514,7 +506,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -569,7 +561,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -630,13 +622,14 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "def miRSim_main_module(fa_file,total_no_seq,std_seq_percent,depth,ascii_base,out,out_file_name,ground_truth_filename,\n", - " gff_file,mir_type,out_file_type,repeat,distribution,seed,adaptor,no_mismatch_seed , no_mismatch_xseed,\n", - " parallel_thread,*argv):\n", + "def miRSim_main_module(fa_file,total_no_seq,std_seq_percent,depth,ascii_base,out,\n", + " out_file_name,ground_truth_filename,gff_file,mir_type,\n", + " out_file_type,repeat,distribution,seed,adaptor,no_mismatch_seed,\n", + " no_mismatch_xseed,parallel_thread,*argv):\n", " start = time.time()\n", " \n", " if no_mismatch_seed >= 6:\n", @@ -682,22 +675,22 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "save_path = os.getcwd()\n", - "mir_file = '/mnt/miRSim/refs/mature_high_conf_hsa.fa'\n", - "mir_gff = '/mnt/miRSim/refs/hsa_high_conf.gff3'\n", + "mir_file = '../refs/mature_high_conf_hsa.fa'\n", + "mir_gff = '../refs/hsa_high_conf.gff3'\n", "adaptor = 'TGGAATTCTCGGGTGCCAAGG'\n", - "parallel_thread = 12\n", - "pir_file = '/mnt/miRSim/refs/piRNAdb.hsa.v1_7_5.fa'\n", - "pir_gff = '/mnt/miRSim/refs/pirnadb.hg38.gff3'" + "parallel_thread = 10\n", + "pir_file = '../refs/piRNAdb.hsa.v1_7_5.fa'\n", + "pir_gff = '../refs/pirnadb.hg38.gff3'" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -708,7 +701,7 @@ "****Sequence with error in seed region are generated.****\n", "****Sequence with error in xseed region are generated.****\n", "****Sequence with error in both seed and xseed region are generated.****\n", - "Writing 12 temporary files with chunk size of 3334\n", + "Writing 10 temporary files with chunk size of 4000\n", "------------------------------\n", "writing 1 chunk\n", "------------------------------\n", @@ -727,19 +720,16 @@ "writing 8 chunk\n", "------------------------------\n", "writing 9 chunk\n", - "------------------------------\n", - "writing 10 chunk\n", - "------------------------------\n", - "writing 11 chunk\n", "-----------------------------------------------------------------------\n", "All chunks are written. Now merging all the temporary files and compressing it.\n", - "Runtime of the program is 0.47 minutes\n" + "Runtime of the program is 0.52 minutes\n" ] } ], "source": [ - "miRSim_main_module(mir_file,50000,10,10,33,save_path,'mirna_raw_data.fastq.gz','mirna_ground_truth.csv',\n", - " mir_gff,'mirna','fastq',True,'poisson','None',adaptor,2,2,parallel_thread,10,10,10)" + "miRSim_main_module(mir_file,50000,10,10,33,save_path,'mirna_raw_data.fastq.gz',\n", + " 'mirna_ground_truth.csv',mir_gff,'mirna','fastq',True,\n", + " 'poisson','None',adaptor,2,2,parallel_thread,10,10,10)" ] }, { @@ -767,7 +757,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.9.7" }, "latex_envs": { "LaTeX_envs_menu_present": true, diff --git a/README.md b/README.md old mode 100644 new mode 100755 index 8ffb17e..e3dce9d --- a/README.md +++ b/README.md @@ -193,22 +193,19 @@ Total Number of sequence = 500000 ``` In order to generate such data you need to call miRSim module separately for each RNA type using following commands `(assuming minimum depth=default, file_type=fastq, encoding_quality=33, adaptor=default)`: +**For miRNA synthetic data** ``` -For miRNA synthetic data - python miRSim.py -i refs/mature_high_conf_hsa.fa -n mirna_raw_data.fastq.gz -g mirna_ground_truth.csv -gff refs/hsa_high_conf.gff3 -t 500000 -st 20 -s 10 -x 10 -b 5 -se 1001 -th 6 -rna miRNA ``` +**For piRNA synthetic data** ``` -For piRNA synthetic data - python miRSim.py -i refs/piRNAdb.hsa.v1_7_5.fa -n pirna_raw_data.fastq.gz -g pirna_ground_truth.csv -gff refs/pirnadb.hg38.gff3 -t 500000 -st 10 -s 10 -x 5 -b 5 -se 1001 -th 6 -rna piRNA ``` +**For novel miRNA synthetic data** ``` -For novel miRNA synthetic data - python miRSim.py -i refs/final_novel_seq_filtered.fa -n novel_mirna_raw_data.fastq.gz -g novel_mirna_ground_truth.csv -gff refs/novel_gff.gff3 -t 500000 -st 10 -s 10 -x 3 -b 2 -se 1001 -th 6 -rna novelRNA ``` After generating synthetic data for each individual RNA, merge these fastq files and their ground truth csv. e.g. diff --git a/check_sequence.py b/check_sequence.py new file mode 100644 index 0000000..c6a4a23 --- /dev/null +++ b/check_sequence.py @@ -0,0 +1,10 @@ +#!/usr/bin/python + +def check_sequence(rna_dict, seq): + all_sequences = [v for _,v in rna_dict.items()] + if seq.upper() in all_sequences: + unique_flag = False + return unique_flag + else: + unique_flag = True + return unique_flag \ No newline at end of file diff --git a/generate_sequence.py b/generate_sequence.py old mode 100644 new mode 100755 index 711b351..4168f65 --- a/generate_sequence.py +++ b/generate_sequence.py @@ -6,6 +6,7 @@ from cigar_generation import * from mir_location import * from sequence_alteration import * +from check_sequence import * def generate_sequence(fasta_seq, gff_df, rna_dict, no_mir_chr, n_seq_per_chr, depth, seq_error, out, out_file,write_mode,repeat,distribution, no_mismatch_seed , no_mismatch_xseed,seed): @@ -24,6 +25,7 @@ def generate_sequence(fasta_seq, gff_df, rna_dict, no_mir_chr, n_seq_per_chr, de chr_name = chr_list[i] + '$' mir_complete_list = list(gff_df[gff_df['chr'].str.contains(chr_name)].index) if mir_complete_list: +# print(i) total_exp = n_seq_per_chr[i] if not no_mir_chr[i] > len(mir_complete_list): mir_idx = [random.randint(0,len(mir_complete_list)-1) for v in range(int(no_mir_chr[i]))] @@ -58,14 +60,23 @@ def generate_sequence(fasta_seq, gff_df, rna_dict, no_mir_chr, n_seq_per_chr, de mir_seq_new = rna_dict[mir] mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new) elif seq_error == 'Seed_region': - mir_seq_new = sequence_alteration(rna_dict[mir],'seed',no_mismatch_seed , no_mismatch_xseed, seed) - mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new) + unique_flag = False + while unique_flag == False: + mir_seq_new = sequence_alteration(rna_dict[mir],'seed',no_mismatch_seed , no_mismatch_xseed, seed) + mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new) + unique_flag = check_sequence(rna_dict,mir_seq_new) elif seq_error == 'Outside_Seed_region': - mir_seq_new = sequence_alteration(rna_dict[mir],'xseed',no_mismatch_seed , no_mismatch_xseed, seed) - mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new) + unique_flag= False + while unique_flag == False: + mir_seq_new = sequence_alteration(rna_dict[mir],'xseed',no_mismatch_seed , no_mismatch_xseed, seed) + mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new) + unique_flag = check_sequence(rna_dict,mir_seq_new) elif seq_error == 'Both_region': - mir_seq_new = sequence_alteration(rna_dict[mir],'both',no_mismatch_seed , no_mismatch_xseed, seed) - mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new) + unique_flag= False + while unique_flag == False: + mir_seq_new = sequence_alteration(rna_dict[mir],'both',no_mismatch_seed , no_mismatch_xseed, seed) + mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new) + unique_flag = check_sequence(rna_dict,mir_seq_new) loc = mir_location(gff_df,mir,seed) mir_depth = int(exp) @@ -73,7 +84,14 @@ def generate_sequence(fasta_seq, gff_df, rna_dict, no_mir_chr, n_seq_per_chr, de line += mir + '\t' + seq_error + '\t' + mir_cigar + '\t' + rna_dict[mir] + '\t' + mir_seq_new + '\t' + loc[0] + '\t' + str(loc[1]) + '\t' + str(loc[2]) + '\t' + str(mir_depth) + '\n' rna_ground_truth.write(line) for dep in range(mir_depth): - fasta_seq.append('>' + mir) + if seq_error == 'None': + fasta_seq.append('>' + mir + '_no_alterations') + if seq_error == 'Seed_region': + fasta_seq.append('>' + mir + '_seed_alterations') + if seq_error == 'Seed_region': + fasta_seq.append('>' + mir + '_xseed_alterations') + if seq_error == 'Both_region': + fasta_seq.append('>' + mir + '_both_seed_&_xseed_alterations') fasta_seq.append(mir_seq_new) return fasta_seq \ No newline at end of file diff --git a/miRSim.ipynb b/miRSim.ipynb new file mode 100755 index 0000000..7f2d3da --- /dev/null +++ b/miRSim.ipynb @@ -0,0 +1,805 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "import shutil\n", + "import random\n", + "import sys\n", + "import threading\n", + "import gzip\n", + "import time\n", + "import warnings\n", + "from scipy.stats import gamma\n", + "from scipy.stats import uniform\n", + "from scipy.stats import expon\n", + "from scipy.stats import poisson\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def alter_nt(seq_seed,position):\n", + " if seq_seed[position] == 'A':\n", + " seq_seed[position] = 'C'\n", + " elif seq_seed[position] == 'T':\n", + " seq_seed[position] = 'G'\n", + " elif seq_seed[position] == 'G':\n", + " seq_seed[position] = 'A'\n", + " else:\n", + " seq_seed[position] = 'T'\n", + " \n", + " return seq_seed[position]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def sequence_alteration(mir_seq,site,no_mismatch_seed,no_mismatch_xseed, seed):\n", + " if isinstance(seed, int):\n", + " random.seed(seed)\n", + " if site == 'seed':\n", + " seq_seed = mir_seq[1:7]\n", + " seq_seed = list(seq_seed)\n", + " idx = list(range(1,7))\n", + " for i in range(no_mismatch_seed): \n", + " x1 = random.randint(0, len(idx)-1)\n", + " seq_seed[x1] = alter_nt(seq_seed,x1)\n", + " idx.pop(x1)\n", + " new_seed = \"\"\n", + " new_seed = new_seed.join(seq_seed)\n", + " mir_seq_new = mir_seq[:1] + new_seed + mir_seq[7:]\n", + " \n", + " elif site == 'xseed':\n", + " Xseq_seed = mir_seq[7:]\n", + " Xseq_seed = list(Xseq_seed)\n", + " idx = list(range(0,len(Xseq_seed)))\n", + " for j in range(no_mismatch_xseed): \n", + " x3 = random.randint(0, len(idx)-1)\n", + " Xseq_seed[x3] = alter_nt(Xseq_seed,x3)\n", + " idx.pop(x3)\n", + " new_xseed = \"\"\n", + " new_xseed = new_xseed.join(Xseq_seed)\n", + " mir_seq_new = mir_seq[:7] + new_xseed\n", + " \n", + " elif site == 'both':\n", + " no_mismatch_both = no_mismatch_seed + no_mismatch_xseed\n", + " for k in range(no_mismatch_both):\n", + " mir_seq1 = sequence_alteration(mir_seq,'seed',no_mismatch_seed , no_mismatch_xseed, seed)\n", + " mir_seq_new = sequence_alteration(mir_seq1,'xseed',no_mismatch_seed , no_mismatch_xseed, seed)\n", + " \n", + " return mir_seq_new" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def cigar_generation(actual_seq,altered_seq):\n", + " seq1 = list(actual_seq)\n", + " seq2 = list(altered_seq)\n", + " cigar = []\n", + " if len(seq1) == len(seq2):\n", + " for idx in range(len(seq1)):\n", + " if seq1[idx] == seq2[idx]:\n", + " cigar.append('-')\n", + " else:\n", + " cigar.append(seq2[idx])\n", + " else:\n", + " cigar = ['-' for l in range(len(seq1))]\n", + " \n", + " return ''.join(cigar).upper()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def gff(gff_file,rna_type):\n", + " gff_file = open(gff_file).read().split('\\n')\n", + " df = pd.DataFrame()\n", + " if rna_type == 'mirna':\n", + " gff_file = gff_file[13:-1]\n", + " chr_loc = [c.split('\\t')[0] for c in gff_file]\n", + " mirna_type = [c.split('\\t')[2] for c in gff_file]\n", + " chr_start = [c.split('\\t')[3] for c in gff_file]\n", + " chr_end = [c.split('\\t')[4] for c in gff_file]\n", + " strand = [c.split('\\t')[6] for c in gff_file]\n", + " mir_id = [c.split('\\t')[8].split(';')[-2].split('=')[-1] for c in gff_file]\n", + " df['mir_id'] = mir_id\n", + " df['chr'] = chr_loc\n", + " df['chr_start'] = chr_start\n", + " df['chr_end'] = chr_end\n", + " df['strand'] = strand\n", + " df['mirna_type'] = mirna_type\n", + " df = df.set_index('mir_id')\n", + " df = df[~df['mirna_type'].str.contains('miRNA_primary_transcript')]\n", + " df = df.drop('mirna_type',axis=1)\n", + " else:\n", + " gff_file = gff_file[1:-1]\n", + " rna_id = [c.split('\\t')[-1] for c in gff_file]\n", + " chr_loc = [c.split('\\t')[0] for c in gff_file]\n", + " chr_start = [c.split('\\t')[3] for c in gff_file]\n", + " chr_end = [c.split('\\t')[4] for c in gff_file]\n", + " strand = [c.split('\\t')[6] for c in gff_file]\n", + " df['rna_id'] = rna_id\n", + " df['chr'] = chr_loc\n", + " df['chr_start'] = chr_start\n", + " df['chr_end'] = chr_end\n", + " df['strand'] = strand\n", + " df = df.set_index('rna_id')\n", + " \n", + " return df\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def mir_location(df,rna_name,seed): \n", + " if isinstance(seed, int):\n", + " random.seed(seed)\n", + " df1 = df.loc[[rna_name],:]\n", + " if df1.shape[0] == 1:\n", + " return [df1.iloc[0,0],df1.iloc[0,1],df1.iloc[0,2]]\n", + " else:\n", + " random_idx = random.randint(1,df1.shape[0]-1)\n", + " return [df1.iloc[random_idx,0],df1.iloc[random_idx,1],df1.iloc[random_idx,2]]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def sort_by_chromosome(chr_list):\n", + " a = [x.split('chr')[-1] for x in chr_list]\n", + " if 'X' in a: \n", + " X_present = True\n", + " a.remove('X')\n", + " else:\n", + " X_present = False\n", + " \n", + " if 'Y' in a: \n", + " Y_present = True\n", + " a.remove('Y')\n", + " else:\n", + " Y_present = False\n", + " if 'M' in a: \n", + " M_present = True\n", + " a.remove('M')\n", + " else:\n", + " M_present = False\n", + " if 'MT' in a:\n", + " MT_present = True\n", + " a.remove('MT')\n", + " else:\n", + " MT_present = False\n", + " \n", + " a = list(map(int,a))\n", + " b = sorted(a)\n", + " c = ['chr'+str(x) for x in b]\n", + " if X_present:\n", + " c.append('chrX')\n", + " if Y_present:\n", + " c.append('chrY')\n", + " if M_present:\n", + " c.append('chrM')\n", + " elif MT_present:\n", + " c.append('chrMT')\n", + " \n", + " return(c)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def sequence_calculation(total_no_seq, desired_seq_percent,depth,seed,gff_df):\n", + " \n", + " chr_list = sort_by_chromosome(list(gff_df['chr'].unique()))\n", + " if not desired_seq_percent == 0:\n", + " n_seq = total_no_seq*desired_seq_percent/100\n", + " chr_count = pd.DataFrame(pd.Series(gff_df['chr']).value_counts())\n", + " chr_count = pd.DataFrame(chr_count, index=chr_list)\n", + " n_seq_per_chr = [i*n_seq for i in list(chr_count['chr']/gff_df.shape[0])]\n", + " n_seq_per_chr = list(map(int,n_seq_per_chr))\n", + " n_seq_per_chr[-1] = int(n_seq - sum(n_seq_per_chr[:-1]))\n", + "\n", + " no_mir_chr = [] \n", + " actual_no_mir_chr = [len(list(gff_df[gff_df['chr'].str.contains(chr_name+'$')].index)) for chr_name in chr_list]\n", + " for v1,v2 in zip(n_seq_per_chr,actual_no_mir_chr):\n", + " if v1 > depth:\n", + " no_mir = len(expression_split(v1,v2,'poisson',seed,depth))\n", + " if no_mir >= 1:\n", + " no_mir_chr.append(no_mir)\n", + " else:\n", + " print('Error : Minimum number of miRs per chromosome should be %d. With your provided information it is %d'%(1, no_mir))\n", + " print('Hint: Try reducing minimum depth so that minimum number of miRs per chromosome becomes more than 1.')\n", + " sys.exit(1)\n", + " else:\n", + " no_mir = 1\n", + " no_mir_chr.append(no_mir)\n", + " else:\n", + " no_mir_chr = []\n", + " n_seq_per_chr = []\n", + "# print(n_seq_per_chr, len(n_seq_per_chr))\n", + " return no_mir_chr,n_seq_per_chr" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def expression_split(number, number_of_subsections, distribution, seed, min_random_number_desired):\n", + " split_number_list = []\n", + " cumulative_sum_of_random_numbers = 0\n", + " current_subsection = 1\n", + " max_random_number = int(number/number_of_subsections)\n", + " if isinstance(seed, int):\n", + " np.random.RandomState(seed)\n", + " else:\n", + " seed = np.random.RandomState()\n", + " if min_random_number_desired < number:\n", + " if min_random_number_desired > max_random_number:\n", + "# print(\"WARNING: Cannot have min number as {} and split {} in {} subsections\".format(min_random_number_desired, number, number_of_subsections))\n", + " number_of_subsections = int(np.floor(number/min_random_number_desired))\n", + " return expression_split(number,number_of_subsections,distribution, seed, min_random_number_desired)\n", + " \n", + " elif distribution == 'uniform':\n", + " split_num1 = uniform.rvs(size=number_of_subsections, loc = 10, scale=20,random_state=seed)\n", + " elif distribution == 'gamma':\n", + " split_num1 = gamma.rvs(a=5, size=number_of_subsections,random_state=seed)\n", + " elif distribution == 'exponential':\n", + " split_num1 = expon.rvs(scale=1,loc=0,size=number_of_subsections,random_state=seed)\n", + " elif distribution == 'poisson':\n", + " split_num1 = poisson.rvs(mu=3, size=number_of_subsections,random_state=seed)\n", + " try:\n", + " split_num1 = [int(number*v/sum(split_num1)) for v in split_num1]\n", + " except:\n", + " # Error may occur when split_num1 = [0]\n", + " expression_split(number, number_of_subsections, distribution, seed, min_random_number_desired)\n", + " if len(split_num1) > 1:\n", + " num_test = [1 for v in split_num1 if v <= min_random_number_desired]\n", + " if sum(num_test) > int(0.25*len(split_num1)):\n", + " return expression_split(number,int(number_of_subsections*0.75),distribution, seed, min_random_number_desired)\n", + " for i in range(len(split_num1)):\n", + " if split_num1[i] < min_random_number_desired:\n", + " split_num1[i] = min_random_number_desired \n", + " split_num1[-1] = number-sum(split_num1[:-1])\n", + " if sum([1 for v in split_num1 if v<=0]) >= 1:\n", + " return expression_split(number,int(number_of_subsections*0.75),distribution, seed, min_random_number_desired)\n", + " return split_num1\n", + " else:\n", + "# print('WARNING : minimum depth is greater than provided number and can not be splitted.')\n", + " expression_split(number,1,distribution, seed, number)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def update_gff(gff_df,out,ground_truth_filename):\n", + " rna_truth = pd.read_csv(os.path.join(out,ground_truth_filename),sep = '\\t')\n", + " rna_truth = rna_truth.set_index('RNA_ID')\n", + " for idx in range(rna_truth.shape[0]):\n", + " chr_start = str(rna_truth.iloc[idx,4])\n", + " chr_end = str(rna_truth.iloc[idx,5])\n", + " gff_df = gff_df[~gff_df.isin([chr_start,chr_end])]\n", + " gff_df = gff_df.dropna()\n", + " return gff_df" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_sequence(fasta_seq, gff_df, rna_dict, no_mir_chr, n_seq_per_chr, depth, seq_error, out, out_file,write_mode,repeat,distribution, no_mismatch_seed , no_mismatch_xseed,seed):\n", + " \n", + " if isinstance(seed, int):\n", + " random.seed(seed)\n", + " if write_mode == 'write':\n", + " rna_ground_truth = open(os.path.join(out,out_file),'w')\n", + " header = 'RNA_ID\\tImpure_Region\\tCigar_String\\tref_Sequence\\tsynthetic_sequence\\tchr\\tchr_start\\tchr_end\\tExpression_count\\n'\n", + " rna_ground_truth.write(header)\n", + " elif write_mode == 'append':\n", + " rna_ground_truth = open(os.path.join(out,out_file),'a')\n", + " \n", + " chr_list = list(gff_df['chr'].unique())\n", + " if no_mir_chr and n_seq_per_chr:\n", + " for i in range(len(chr_list)):\n", + " chr_name = chr_list[i] + '$'\n", + " mir_complete_list = list(gff_df[gff_df['chr'].str.contains(chr_name)].index) \n", + " if mir_complete_list:\n", + "# print(i)\n", + " total_exp = n_seq_per_chr[i]\n", + " if not no_mir_chr[i] > len(mir_complete_list): \n", + " mir_idx = [random.randint(0,len(mir_complete_list)-1) for v in range(int(no_mir_chr[i]))]\n", + " mir_list = [mir_complete_list[v] for v in mir_idx]\n", + " else: \n", + " if repeat:\n", + " complete_flag = True\n", + " while complete_flag == True:\n", + " mir_complete_list += mir_complete_list\n", + " if no_mir_chr[i] < len(mir_complete_list):\n", + " complete_flag = False\n", + " mir_idx = [random.randint(0,len(mir_complete_list)-1) for v in range(int(no_mir_chr[i]))]\n", + " mir_list = [mir_complete_list[v] for v in mir_idx]\n", + " else: \n", + " mir_idx = [random.randint(0,len(mir_complete_list)-1) for v in range(int(mir_complete_list))]\n", + " mir_list = [mir_complete_list[v] for v in mir_idx]\n", + "\n", + " complete_flag = True\n", + " while complete_flag:\n", + " try:\n", + " expression_counts = expression_split(total_exp,len(mir_list),distribution,seed,depth)\n", + " if expression_counts:\n", + " complete_flag = False\n", + " except:\n", + " print('The number of RNAs and total available depth in %s is %d and %d too less with error %s' %(chr_name,len(mir_list),total_exp,seq_error))\n", + " expression_counts = expression_split(total_exp,len(mir_list),distribution,seed,int(depth*0.5))\n", + " if expression_counts:\n", + " complete_flag = False\n", + "\n", + " for mir,exp in zip(mir_list,expression_counts):\n", + " if seq_error == 'None':\n", + " mir_seq_new = rna_dict[mir]\n", + " mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n", + " elif seq_error == 'Seed_region':\n", + " mir_seq_new = sequence_alteration(rna_dict[mir],'seed',no_mismatch_seed , no_mismatch_xseed, seed)\n", + " mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n", + " elif seq_error == 'Outside_Seed_region':\n", + " mir_seq_new = sequence_alteration(rna_dict[mir],'xseed',no_mismatch_seed , no_mismatch_xseed, seed)\n", + " mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n", + " elif seq_error == 'Both_region':\n", + " mir_seq_new = sequence_alteration(rna_dict[mir],'both',no_mismatch_seed , no_mismatch_xseed, seed)\n", + " mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n", + "\n", + " loc = mir_location(gff_df,mir,seed)\n", + " mir_depth = int(exp)\n", + " line = ''\n", + " line += mir + '\\t' + seq_error + '\\t' + mir_cigar + '\\t' + rna_dict[mir] + '\\t' + mir_seq_new + '\\t' + loc[0] + '\\t' + str(loc[1]) + '\\t' + str(loc[2]) + '\\t' + str(mir_depth) + '\\n'\n", + " rna_ground_truth.write(line) \n", + " for dep in range(mir_depth):\n", + " fasta_seq.append('>' + mir)\n", + " fasta_seq.append(mir_seq_new)\n", + "\n", + " return fasta_seq" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def write_small_fastq_chunks(fasta_seq_chunk,out_file,counter,out_file_type,quality_char,adaptor,seed):\n", + "\n", + " primer_nt = ['A','T','G','C']\n", + " for line in fasta_seq_chunk:\n", + " line1 = \"\"\n", + " line_no = fasta_seq_chunk.index(line)\n", + " if \">\" in line:\n", + " if out_file_type == 'fastq':\n", + " line1 += \"@\" + line[1:] + '-' + str(counter) + '\\n'\n", + " seq = fasta_seq_chunk[line_no+1] + adaptor # \"TGGAATTCTCGGGTGCCAAGG\"\n", + " primer_len = 75 - len(seq)\n", + " primer_string = ''.join(char*random.randint(0,100) for char in primer_nt)\n", + " primer_string = list(primer_string)\n", + " primer_string = random.sample(primer_string,len(primer_string))\n", + " primer_string = ''.join(c for c in primer_string)[:primer_len]\n", + " seq += primer_string \n", + " line1 += fasta_seq_chunk[line_no+1] + adaptor + primer_string + '\\n' # \"TGGAATTCTCGGGTGCCAAGG\" + '\\n'\n", + " line1 += '+' + '\\n'\n", + " quality_string = ''.join(char*random.randint(0,15) for char in quality_char)\n", + " quality_string = list(quality_string)\n", + " quality_string = random.sample(quality_string,len(quality_string))\n", + " quality_string = ''.join(c for c in quality_string)\n", + " quality_flag = False\n", + " while quality_flag:\n", + " if len(quality_string) < len(seq):\n", + " quality_string += quality_string\n", + " quality_string = list(quality_string)\n", + " quality_string = random.sample(quality_string,len(quality_string))\n", + " quality_string = ''.join(c for c in quality_string)\n", + " else:\n", + " quality_flag = True\n", + " break\n", + " line1 += quality_string[:len(seq)] + '\\n'\n", + " out_file.write(line1)\n", + " counter += 1\n", + " elif out_file_type == 'fasta':\n", + " line1 += line + '-' + str(counter) + '\\n'\n", + " line1 += fasta_seq_chunk[line_no+1] + adaptor + '\\n'\n", + " out_file.write(line1)\n", + " counter += 1\n", + "\n", + "# out_file.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def execute_parallel_thread_for_file_write(fasta_seq_chunk,out_file,counter,out,out_file_type,quality_char,adaptor,seed):\n", + " max_no_line = len(fasta_seq_chunk)\n", + " if max_no_line > 10000:\n", + " chunk_size = 10000\n", + " no_of_splitted_file = int(np.ceil(max_no_line/chunk_size))\n", + " else:\n", + " chunk_size = max_no_line\n", + " no_of_splitted_file = 1\n", + " for j in range(no_of_splitted_file):\n", + " if out_file_type == 'fastq':\n", + " tmp_file_name = out_file + '_' + str(j) + '.fastq'\n", + " else:\n", + " tmp_file_name = out_file + '_' + str(j) + '.fasta'\n", + " \n", + " out_file1 = open(os.path.join(out,'temp',tmp_file_name),'w')\n", + " fasta_seq_small_chunk = fasta_seq_chunk[j*chunk_size:(j+1)*chunk_size]\n", + " counter1 = counter + j*len(fasta_seq_small_chunk)\n", + " write_small_fastq_chunks(fasta_seq_small_chunk,out_file1,counter1,out_file_type,quality_char,adaptor,seed)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def write_fastq(fasta_seq, adaptor, ascii_base, out, out_file_name,out_file_type,seed,parallel_thread):\n", + " \n", + " # Not considering phred score less then 20 so that mean quality score is always greater than 20 for smooth bioinformatics analysis\n", + " if ascii_base == 33:\n", + " quality_char = ['5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K']\n", + " elif ascii_base == 64:\n", + " quality_char = ['T','U','V','W','X','Y','Z', '^',' ', '-', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']\n", + " else:\n", + " quality_char = []\n", + " \n", + " max_no_line = len(fasta_seq)\n", + " no_of_splitted_file = parallel_thread\n", + " chunk_size = int(np.ceil(max_no_line/no_of_splitted_file))\n", + " print('Writing %d temporary files with chunk size of %d'%(no_of_splitted_file,chunk_size))\n", + " if 'temp' in list(os.listdir(out)):\n", + " cmd = 'rm -rf '+ os.path.join(out,'temp','..?* .[!.]*') + ' && rm -rf '+ os.path.join(out,'temp') + ' 2> /dev/null'\n", + " os.system(cmd)\n", + "# shutil.rmtree(os.path.join(out,'temp'))\n", + " os.mkdir(os.path.join(out,'temp'))\n", + " else:\n", + " os.mkdir(os.path.join(out,'temp'))\n", + " \n", + " for j in range(no_of_splitted_file):\n", + " tmp_file_name = 'tmp_'+str(j) \n", + " out_file = tmp_file_name \n", + " fasta_seq_chunk = fasta_seq[j*chunk_size:(j+1)*chunk_size]\n", + " if j >=1:\n", + " counter = j*len(fasta_seq_chunk)\n", + " print('------------------------------')\n", + " print('writing %d chunk'%j)\n", + " threading.Thread(target=execute_parallel_thread_for_file_write,args=(fasta_seq_chunk,out_file,counter,out,out_file_type,quality_char,adaptor,seed,)).start()\n", + " else:\n", + " pass\n", + "\n", + " counter = 0\n", + " tmp_file_name = 'tmp_'+str(0)\n", + " out_file = tmp_file_name \n", + " fasta_seq_chunk = fasta_seq[0*chunk_size:(0+1)*chunk_size]\n", + " execute_parallel_thread_for_file_write(fasta_seq_chunk,out_file,counter,out,out_file_type,quality_char,adaptor,seed)\n", + " time.sleep(20)\n", + " print('-----------------------------------------------------------------------')\n", + " print('All chunks are written. Now merging all the temporary files and compressing it.')\n", + " if out_file_type == 'fastq':\n", + " cmd = 'gzip '+ os.path.join(out,'temp') +'/tmp_*.fastq -c > ' + os.path.join(out,out_file_name)\n", + " else:\n", + " cmd = 'gzip '+ os.path.join(out,'temp') + '/tmp_*.fasta -c > ' + os.path.join(out,out_file_name)\n", + " os.system(cmd)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_synthetic_data(fa_file,total_no_seq,std_seq_percent,seed_error_percent,xseed_error_percent, \n", + " both_error_percent,depth,ascii_base,out,out_file_name,ground_truth_filename,\n", + " gff_file,mir_type,out_file_type,repeat,distribution,seed,adaptor,no_mismatch_seed ,\n", + " no_mismatch_xseed,parallel_thread):\n", + " \n", + " file_read = open(fa_file).read().split('\\n')\n", + " file_read = file_read[:-1]\n", + " rna_dict = {}\n", + " for line in file_read:\n", + " if \">\" in line:\n", + " line_no = file_read.index(line)\n", + " rna_dict[line[1:]] = file_read[line_no+1]\n", + "\n", + " fasta_seq = []\n", + " # for pure sequences only\n", + " gff_df = gff(gff_file,mir_type)\n", + " no_mir_chr,n_seq_per_chr = sequence_calculation(total_no_seq, std_seq_percent, depth,seed,gff_df) \n", + " fasta_seq = generate_sequence(fasta_seq,gff_df, rna_dict, no_mir_chr, n_seq_per_chr, depth, 'None', out, ground_truth_filename,'write',repeat,distribution, no_mismatch_seed , no_mismatch_xseed, seed)\n", + " print('****Pure Sequence are generated.****')\n", + "\n", + " # for seed region error sequences only\n", + " if not repeat:\n", + " gff_df = update_gff(gff_df,out,ground_truth_filename) # Update gff dataframe so that there is no repitition.\n", + " no_mir_chr,n_seq_per_chr = sequence_calculation(total_no_seq, seed_error_percent, depth,seed,gff_df)\n", + " fasta_seq = generate_sequence(fasta_seq,gff_df, rna_dict, no_mir_chr, n_seq_per_chr, depth, 'Seed_region', out, ground_truth_filename,'append',repeat,distribution, no_mismatch_seed , no_mismatch_xseed, seed)\n", + " print('****Sequence with error in seed region are generated.****')\n", + "\n", + " # for outside seed region error sequences only\n", + " if not repeat:\n", + " gff_df = update_gff(gff_df,out,ground_truth_filename)\n", + " no_mir_chr,n_seq_per_chr = sequence_calculation(total_no_seq, xseed_error_percent, depth,seed,gff_df)\n", + " fasta_seq = generate_sequence(fasta_seq,gff_df, rna_dict, no_mir_chr, n_seq_per_chr, depth, 'Outside_Seed_region', out, ground_truth_filename,'append',repeat,distribution, no_mismatch_seed , no_mismatch_xseed,seed)\n", + " print('****Sequence with error in xseed region are generated.****')\n", + "\n", + " # for both seed region error and outside seed region error sequences\n", + " if not repeat:\n", + " gff_df = update_gff(gff_df,out,ground_truth_filename)\n", + " no_mir_chr, n_seq_per_chr = sequence_calculation(total_no_seq, both_error_percent, depth,seed,gff_df)\n", + " fasta_seq = generate_sequence(fasta_seq,gff_df, rna_dict, no_mir_chr, n_seq_per_chr, depth, 'Both_region', out, ground_truth_filename,'append',repeat,distribution, no_mismatch_seed , no_mismatch_xseed,seed)\n", + " print('****Sequence with error in both seed and xseed region are generated.****')\n", + "\n", + " write_fastq(fasta_seq, adaptor, ascii_base, out, out_file_name,out_file_type,seed,parallel_thread)\n", + "\n", + "# merge similar synthetic sequences\n", + " ground_truth_df = pd.read_csv(os.path.join(out,ground_truth_filename),sep='\\t')\n", + " ground_truth_df = ground_truth_df.groupby(['RNA_ID','Impure_Region','Cigar_String','ref_Sequence','synthetic_sequence','chr','chr_start','chr_end']).sum().reset_index()\n", + " ground_truth_df = ground_truth_df.sort_values('Impure_Region')\n", + " ground_truth_df.to_csv(os.path.join(out,ground_truth_filename),encoding='utf-8',index=False)\n", + "\n", + " # deleting temporary folder\n", + " cmd = 'rm -rf '+ os.path.join(out,'temp','..?* .[!.]*') + ' && rm -rf '+ os.path.join(out,'temp') + ' 2> /dev/null'\n", + " os.system(cmd)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "def miRSim_main_module(fa_file,total_no_seq,std_seq_percent,depth,ascii_base,out,out_file_name,ground_truth_filename,\n", + " gff_file,mir_type,out_file_type,repeat,distribution,seed,adaptor,no_mismatch_seed , no_mismatch_xseed,\n", + " parallel_thread,*argv):\n", + " start = time.time()\n", + " \n", + " if no_mismatch_seed >= 6:\n", + " sys.exit(\"Maximum number of mismatch in seed region should not be more than 6.\")\n", + " \n", + " if no_mismatch_xseed >= 12:\n", + " sys.exit(\"Maximum number of mismatch in xseed region should not be more than 12.\")\n", + " \n", + " if len(list(argv)) == 1:\n", + " non_rna = list(argv)[0]\n", + " seed_error_percent = int(non_rna/3)\n", + " xseed_error_percent = int(non_rna/3)\n", + " both_error_percent = non_rna - seed_error_percent - xseed_error_percent\n", + " \n", + " generate_synthetic_data(fa_file,total_no_seq,std_seq_percent,seed_error_percent,xseed_error_percent, \n", + " both_error_percent,depth,ascii_base,out,out_file_name,ground_truth_filename,\n", + " gff_file,mir_type,out_file_type,repeat,distribution,seed,adaptor,no_mismatch_seed ,\n", + " no_mismatch_xseed,parallel_thread)\n", + " end = time.time()\n", + " time_taken = '%.2f' % ((end-start)/60)\n", + " print(f\"Runtime of the program is {time_taken} minutes\")\n", + " \n", + " \n", + " elif len(list(argv)) == 3: \n", + " seed_error_percent,xseed_error_percent,both_error_percent = list(argv)\n", + " generate_synthetic_data(fa_file,total_no_seq,std_seq_percent,seed_error_percent,xseed_error_percent, \n", + " both_error_percent,depth,ascii_base,out,out_file_name,ground_truth_filename,\n", + " gff_file,mir_type,out_file_type,repeat,distribution,seed,adaptor,no_mismatch_seed , no_mismatch_xseed,\n", + " parallel_thread)\n", + " end = time.time()\n", + " time_taken = '%.2f' % ((end-start)/60)\n", + " print(f\"Runtime of the program is {time_taken} minutes\")\n", + " else:\n", + " sys.exit(\"Provide either non-RNA fraction or seed,xseed error fractions.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Change the path of required files**" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "save_path = os.getcwd()\n", + "mir_file = '/mnt/miRSim/refs/mature_high_conf_hsa.fa'\n", + "mir_gff = '/mnt/miRSim/refs/hsa_high_conf.gff3'\n", + "adaptor = 'TGGAATTCTCGGGTGCCAAGG'\n", + "parallel_thread = 12\n", + "pir_file = '/mnt/miRSim/refs/piRNAdb.hsa.v1_7_5.fa'\n", + "pir_gff = '/mnt/miRSim/refs/pirnadb.hg38.gff3'" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "****Pure Sequence are generated.****\n", + "****Sequence with error in seed region are generated.****\n", + "****Sequence with error in xseed region are generated.****\n", + "****Sequence with error in both seed and xseed region are generated.****\n", + "Writing 12 temporary files with chunk size of 3334\n", + "------------------------------\n", + "writing 1 chunk\n", + "------------------------------\n", + "writing 2 chunk\n", + "------------------------------\n", + "writing 3 chunk\n", + "------------------------------\n", + "writing 4 chunk\n", + "------------------------------\n", + "writing 5 chunk\n", + "------------------------------\n", + "writing 6 chunk\n", + "------------------------------\n", + "writing 7 chunk\n", + "------------------------------\n", + "writing 8 chunk\n", + "------------------------------\n", + "writing 9 chunk\n", + "------------------------------\n", + "writing 10 chunk\n", + "------------------------------\n", + "writing 11 chunk\n", + "-----------------------------------------------------------------------\n", + "All chunks are written. Now merging all the temporary files and compressing it.\n", + "Runtime of the program is 0.47 minutes\n" + ] + } + ], + "source": [ + "miRSim_main_module(mir_file,50000,10,10,33,save_path,'mirna_raw_data.fastq.gz','mirna_ground_truth.csv',\n", + " mir_gff,'mirna','fastq',True,'poisson','None',adaptor,2,2,parallel_thread,10,10,10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "hide_input": false, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + }, + "nbTranslate": { + "displayLangs": [ + "*" + ], + "hotkey": "alt-t", + "langInMainMenu": true, + "sourceLang": "en", + "targetLang": "fr", + "useGoogleTranslate": true + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/requirements.txt b/requirements.txt index 1ff3844..f950ac3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ scipy==1.4.1 threaded==4.1.0 numpy==1.18.1 pandas==1.0.1 +random \ No newline at end of file