Skip to content

Commit

Permalink
unique identifier added
Browse files Browse the repository at this point in the history
  • Loading branch information
vivekruhela committed Apr 22, 2022
1 parent 1c8d33f commit 30a065b
Show file tree
Hide file tree
Showing 6 changed files with 917 additions and 96 deletions.
156 changes: 73 additions & 83 deletions Notebook/miRSim.ipynb
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -19,46 +19,14 @@
"from scipy.stats import gamma\n",
"from scipy.stats import uniform\n",
"from scipy.stats import expon\n",
"from scipy.stats import poisson"
"from scipy.stats import poisson\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c1a9746d75674500bca4c8099f5f1bb2",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/vivek/anaconda3/lib/python3.7/site-packages/tqdm/std.py:648: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n",
" from pandas import Panel\n"
]
}
],
"source": [
"from tqdm import tqdm_notebook as tqdm\n",
"tqdm().pandas()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def alter_nt(seq_seed,position):\n",
Expand All @@ -76,7 +44,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -118,7 +86,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -140,7 +108,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -184,7 +152,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -201,7 +169,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -246,7 +214,23 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def check_sequence(rna_dict, seq):\n",
" all_sequences = [v for _,v in rna_dict.items()]\n",
" if seq.upper() in all_sequences:\n",
" unique_flag = False\n",
" return unique_flag\n",
" else:\n",
" unique_flag = True\n",
" return unique_flag "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -284,7 +268,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -334,7 +318,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -351,17 +335,9 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"outputs": [],
"source": [
"def generate_sequence(fasta_seq, gff_df, rna_dict, no_mir_chr, n_seq_per_chr, depth, seq_error, out, out_file,write_mode,repeat,distribution, no_mismatch_seed , no_mismatch_xseed,seed):\n",
" \n",
Expand Down Expand Up @@ -415,30 +391,46 @@
" mir_seq_new = rna_dict[mir]\n",
" mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
" elif seq_error == 'Seed_region':\n",
" mir_seq_new = sequence_alteration(rna_dict[mir],'seed',no_mismatch_seed , no_mismatch_xseed, seed)\n",
" mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
" unique_flag = False\n",
" while unique_flag == False:\n",
" mir_seq_new = sequence_alteration(rna_dict[mir],'seed',no_mismatch_seed , no_mismatch_xseed, seed)\n",
" mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
" unique_flag = check_sequence(rna_dict,mir_seq_new)\n",
" elif seq_error == 'Outside_Seed_region':\n",
" mir_seq_new = sequence_alteration(rna_dict[mir],'xseed',no_mismatch_seed , no_mismatch_xseed, seed)\n",
" mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
" unique_flag= False\n",
" while unique_flag == False:\n",
" mir_seq_new = sequence_alteration(rna_dict[mir],'xseed',no_mismatch_seed , no_mismatch_xseed, seed)\n",
" mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
" unique_flag = check_sequence(rna_dict,mir_seq_new)\n",
" elif seq_error == 'Both_region':\n",
" mir_seq_new = sequence_alteration(rna_dict[mir],'both',no_mismatch_seed , no_mismatch_xseed, seed)\n",
" mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
" unique_flag= False\n",
" while unique_flag == False:\n",
" mir_seq_new = sequence_alteration(rna_dict[mir],'both',no_mismatch_seed , no_mismatch_xseed, seed)\n",
" mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
" unique_flag = check_sequence(rna_dict,mir_seq_new)\n",
"\n",
" loc = mir_location(gff_df,mir,seed)\n",
" mir_depth = int(exp)\n",
" line = ''\n",
" line += mir + '\\t' + seq_error + '\\t' + mir_cigar + '\\t' + rna_dict[mir] + '\\t' + mir_seq_new + '\\t' + loc[0] + '\\t' + str(loc[1]) + '\\t' + str(loc[2]) + '\\t' + str(mir_depth) + '\\n'\n",
" rna_ground_truth.write(line) \n",
" for dep in range(mir_depth):\n",
" fasta_seq.append('>' + mir)\n",
" if seq_error == 'None':\n",
" fasta_seq.append('>' + mir + '_no_alterations')\n",
" if seq_error == 'Seed_region':\n",
" fasta_seq.append('>' + mir + '_seed_alterations')\n",
" if seq_error == 'Seed_region':\n",
" fasta_seq.append('>' + mir + '_xseed_alterations')\n",
" if seq_error == 'Both_region':\n",
" fasta_seq.append('>' + mir + '_both_seed_&_xseed_alterations')\n",
" fasta_seq.append(mir_seq_new)\n",
"\n",
" return fasta_seq"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -488,7 +480,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -514,7 +506,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -569,7 +561,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -630,13 +622,14 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"def miRSim_main_module(fa_file,total_no_seq,std_seq_percent,depth,ascii_base,out,out_file_name,ground_truth_filename,\n",
" gff_file,mir_type,out_file_type,repeat,distribution,seed,adaptor,no_mismatch_seed , no_mismatch_xseed,\n",
" parallel_thread,*argv):\n",
"def miRSim_main_module(fa_file,total_no_seq,std_seq_percent,depth,ascii_base,out,\n",
" out_file_name,ground_truth_filename,gff_file,mir_type,\n",
" out_file_type,repeat,distribution,seed,adaptor,no_mismatch_seed,\n",
" no_mismatch_xseed,parallel_thread,*argv):\n",
" start = time.time()\n",
" \n",
" if no_mismatch_seed >= 6:\n",
Expand Down Expand Up @@ -682,22 +675,22 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"save_path = os.getcwd()\n",
"mir_file = '/mnt/miRSim/refs/mature_high_conf_hsa.fa'\n",
"mir_gff = '/mnt/miRSim/refs/hsa_high_conf.gff3'\n",
"mir_file = '../refs/mature_high_conf_hsa.fa'\n",
"mir_gff = '../refs/hsa_high_conf.gff3'\n",
"adaptor = 'TGGAATTCTCGGGTGCCAAGG'\n",
"parallel_thread = 12\n",
"pir_file = '/mnt/miRSim/refs/piRNAdb.hsa.v1_7_5.fa'\n",
"pir_gff = '/mnt/miRSim/refs/pirnadb.hg38.gff3'"
"parallel_thread = 10\n",
"pir_file = '../refs/piRNAdb.hsa.v1_7_5.fa'\n",
"pir_gff = '../refs/pirnadb.hg38.gff3'"
]
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 19,
"metadata": {},
"outputs": [
{
Expand All @@ -708,7 +701,7 @@
"****Sequence with error in seed region are generated.****\n",
"****Sequence with error in xseed region are generated.****\n",
"****Sequence with error in both seed and xseed region are generated.****\n",
"Writing 12 temporary files with chunk size of 3334\n",
"Writing 10 temporary files with chunk size of 4000\n",
"------------------------------\n",
"writing 1 chunk\n",
"------------------------------\n",
Expand All @@ -727,19 +720,16 @@
"writing 8 chunk\n",
"------------------------------\n",
"writing 9 chunk\n",
"------------------------------\n",
"writing 10 chunk\n",
"------------------------------\n",
"writing 11 chunk\n",
"-----------------------------------------------------------------------\n",
"All chunks are written. Now merging all the temporary files and compressing it.\n",
"Runtime of the program is 0.47 minutes\n"
"Runtime of the program is 0.52 minutes\n"
]
}
],
"source": [
"miRSim_main_module(mir_file,50000,10,10,33,save_path,'mirna_raw_data.fastq.gz','mirna_ground_truth.csv',\n",
" mir_gff,'mirna','fastq',True,'poisson','None',adaptor,2,2,parallel_thread,10,10,10)"
"miRSim_main_module(mir_file,50000,10,10,33,save_path,'mirna_raw_data.fastq.gz',\n",
" 'mirna_ground_truth.csv',mir_gff,'mirna','fastq',True,\n",
" 'poisson','None',adaptor,2,2,parallel_thread,10,10,10)"
]
},
{
Expand Down Expand Up @@ -767,7 +757,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.9.7"
},
"latex_envs": {
"LaTeX_envs_menu_present": true,
Expand Down
9 changes: 3 additions & 6 deletions README.md
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -193,22 +193,19 @@ Total Number of sequence = 500000
```

In order to generate such data you need to call miRSim module separately for each RNA type using following commands `(assuming minimum depth=default, file_type=fastq, encoding_quality=33, adaptor=default)`:
**For miRNA synthetic data**

```
For miRNA synthetic data
python miRSim.py -i refs/mature_high_conf_hsa.fa -n mirna_raw_data.fastq.gz -g mirna_ground_truth.csv -gff refs/hsa_high_conf.gff3 -t 500000 -st 20 -s 10 -x 10 -b 5 -se 1001 -th 6 -rna miRNA
```
**For piRNA synthetic data**

```
For piRNA synthetic data
python miRSim.py -i refs/piRNAdb.hsa.v1_7_5.fa -n pirna_raw_data.fastq.gz -g pirna_ground_truth.csv -gff refs/pirnadb.hg38.gff3 -t 500000 -st 10 -s 10 -x 5 -b 5 -se 1001 -th 6 -rna piRNA
```
**For novel miRNA synthetic data**

```
For novel miRNA synthetic data
python miRSim.py -i refs/final_novel_seq_filtered.fa -n novel_mirna_raw_data.fastq.gz -g novel_mirna_ground_truth.csv -gff refs/novel_gff.gff3 -t 500000 -st 10 -s 10 -x 3 -b 2 -se 1001 -th 6 -rna novelRNA
```
After generating synthetic data for each individual RNA, merge these fastq files and their ground truth csv. e.g.
Expand Down
10 changes: 10 additions & 0 deletions check_sequence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/python

def check_sequence(rna_dict, seq):
all_sequences = [v for _,v in rna_dict.items()]
if seq.upper() in all_sequences:
unique_flag = False
return unique_flag
else:
unique_flag = True
return unique_flag
Loading

0 comments on commit 30a065b

Please sign in to comment.