From 30a065bcb3a89c6c1512793da3d9ecd6b8fb14ca Mon Sep 17 00:00:00 2001
From: vivekruhela <vivekruhela@gmail.com>
Date: Fri, 22 Apr 2022 09:37:07 +0000
Subject: [PATCH] unique identifier added

---
 Notebook/miRSim.ipynb | 156 ++++----
 README.md             |   9 +-
 check_sequence.py     |  10 +
 generate_sequence.py  |  32 +-
 miRSim.ipynb          | 805 ++++++++++++++++++++++++++++++++++++++++++
 requirements.txt      |   1 +
 6 files changed, 917 insertions(+), 96 deletions(-)
 mode change 100644 => 100755 Notebook/miRSim.ipynb
 mode change 100644 => 100755 README.md
 create mode 100644 check_sequence.py
 mode change 100644 => 100755 generate_sequence.py
 create mode 100755 miRSim.ipynb

diff --git a/Notebook/miRSim.ipynb b/Notebook/miRSim.ipynb
old mode 100644
new mode 100755
index 863316e..78b3e2b
--- a/Notebook/miRSim.ipynb
+++ b/Notebook/miRSim.ipynb
@@ -19,46 +19,14 @@
     "from scipy.stats import gamma\n",
     "from scipy.stats import uniform\n",
     "from scipy.stats import expon\n",
-    "from scipy.stats import poisson"
+    "from scipy.stats import poisson\n",
+    "from tqdm import tqdm"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c1a9746d75674500bca4c8099f5f1bb2",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/vivek/anaconda3/lib/python3.7/site-packages/tqdm/std.py:648: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n",
-      "  from pandas import Panel\n"
-     ]
-    }
-   ],
-   "source": [
-    "from tqdm import tqdm_notebook as tqdm\n",
-    "tqdm().pandas()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
    "outputs": [],
    "source": [
     "def alter_nt(seq_seed,position):\n",
@@ -76,7 +44,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -118,7 +86,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -140,7 +108,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -184,7 +152,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -201,7 +169,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -246,7 +214,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def check_sequence(rna_dict, seq):\n",
+    "    all_sequences = [v for _,v in rna_dict.items()]\n",
+    "    if seq.upper() in all_sequences:\n",
+    "        unique_flag = False\n",
+    "        return unique_flag\n",
+    "    else:\n",
+    "        unique_flag = True\n",
+    "        return unique_flag    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -284,7 +268,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -334,7 +318,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -351,17 +335,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "def generate_sequence(fasta_seq, gff_df, rna_dict, no_mir_chr, n_seq_per_chr, depth, seq_error, out, out_file,write_mode,repeat,distribution, no_mismatch_seed , no_mismatch_xseed,seed):\n",
     "    \n",
@@ -415,14 +391,23 @@
     "                        mir_seq_new = rna_dict[mir]\n",
     "                        mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
     "                    elif seq_error == 'Seed_region':\n",
-    "                        mir_seq_new = sequence_alteration(rna_dict[mir],'seed',no_mismatch_seed , no_mismatch_xseed, seed)\n",
-    "                        mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
+    "                        unique_flag = False\n",
+    "                        while unique_flag == False:\n",
+    "                            mir_seq_new = sequence_alteration(rna_dict[mir],'seed',no_mismatch_seed , no_mismatch_xseed, seed)\n",
+    "                            mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
+    "                            unique_flag = check_sequence(rna_dict,mir_seq_new)\n",
     "                    elif seq_error == 'Outside_Seed_region':\n",
-    "                        mir_seq_new = sequence_alteration(rna_dict[mir],'xseed',no_mismatch_seed , no_mismatch_xseed, seed)\n",
-    "                        mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
+    "                        unique_flag= False\n",
+    "                        while unique_flag == False:\n",
+    "                            mir_seq_new = sequence_alteration(rna_dict[mir],'xseed',no_mismatch_seed , no_mismatch_xseed, seed)\n",
+    "                            mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
+    "                            unique_flag = check_sequence(rna_dict,mir_seq_new)\n",
     "                    elif seq_error == 'Both_region':\n",
-    "                        mir_seq_new = sequence_alteration(rna_dict[mir],'both',no_mismatch_seed , no_mismatch_xseed, seed)\n",
-    "                        mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
+    "                        unique_flag= False\n",
+    "                        while unique_flag == False:\n",
+    "                            mir_seq_new = sequence_alteration(rna_dict[mir],'both',no_mismatch_seed , no_mismatch_xseed, seed)\n",
+    "                            mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
+    "                            unique_flag = check_sequence(rna_dict,mir_seq_new)\n",
     "\n",
     "                    loc = mir_location(gff_df,mir,seed)\n",
     "                    mir_depth = int(exp)\n",
@@ -430,7 +415,14 @@
     "                    line += mir + '\\t' + seq_error + '\\t' + mir_cigar + '\\t' + rna_dict[mir] + '\\t' + mir_seq_new + '\\t' + loc[0] + '\\t' + str(loc[1]) + '\\t' + str(loc[2]) + '\\t' + str(mir_depth) + '\\n'\n",
     "                    rna_ground_truth.write(line)            \n",
     "                    for dep in range(mir_depth):\n",
-    "                        fasta_seq.append('>' + mir)\n",
+    "                        if seq_error == 'None':\n",
+    "                            fasta_seq.append('>' + mir + '_no_alterations')\n",
+    "                        if seq_error == 'Seed_region':\n",
+    "                            fasta_seq.append('>' + mir + '_seed_alterations')\n",
+    "                        if seq_error == 'Seed_region':\n",
+    "                            fasta_seq.append('>' + mir + '_xseed_alterations')\n",
+    "                        if seq_error == 'Both_region':\n",
+    "                            fasta_seq.append('>' + mir + '_both_seed_&_xseed_alterations')\n",
     "                        fasta_seq.append(mir_seq_new)\n",
     "\n",
     "    return fasta_seq"
@@ -438,7 +430,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -488,7 +480,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -514,7 +506,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -569,7 +561,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -630,13 +622,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def miRSim_main_module(fa_file,total_no_seq,std_seq_percent,depth,ascii_base,out,out_file_name,ground_truth_filename,\n",
-    "                       gff_file,mir_type,out_file_type,repeat,distribution,seed,adaptor,no_mismatch_seed , no_mismatch_xseed,\n",
-    "                       parallel_thread,*argv):\n",
+    "def miRSim_main_module(fa_file,total_no_seq,std_seq_percent,depth,ascii_base,out,\n",
+    "                       out_file_name,ground_truth_filename,gff_file,mir_type,\n",
+    "                       out_file_type,repeat,distribution,seed,adaptor,no_mismatch_seed,\n",
+    "                        no_mismatch_xseed,parallel_thread,*argv):\n",
     "    start = time.time()\n",
     "    \n",
     "    if no_mismatch_seed >= 6:\n",
@@ -682,22 +675,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [],
    "source": [
     "save_path = os.getcwd()\n",
-    "mir_file = '/mnt/miRSim/refs/mature_high_conf_hsa.fa'\n",
-    "mir_gff = '/mnt/miRSim/refs/hsa_high_conf.gff3'\n",
+    "mir_file = '../refs/mature_high_conf_hsa.fa'\n",
+    "mir_gff = '../refs/hsa_high_conf.gff3'\n",
     "adaptor = 'TGGAATTCTCGGGTGCCAAGG'\n",
-    "parallel_thread = 12\n",
-    "pir_file = '/mnt/miRSim/refs/piRNAdb.hsa.v1_7_5.fa'\n",
-    "pir_gff = '/mnt/miRSim/refs/pirnadb.hg38.gff3'"
+    "parallel_thread = 10\n",
+    "pir_file = '../refs/piRNAdb.hsa.v1_7_5.fa'\n",
+    "pir_gff = '../refs/pirnadb.hg38.gff3'"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
@@ -708,7 +701,7 @@
       "****Sequence with error in seed region are generated.****\n",
       "****Sequence with error in xseed region are generated.****\n",
       "****Sequence with error in both seed and xseed region are generated.****\n",
-      "Writing 12 temporary files with chunk size of 3334\n",
+      "Writing 10 temporary files with chunk size of 4000\n",
       "------------------------------\n",
       "writing 1 chunk\n",
       "------------------------------\n",
@@ -727,19 +720,16 @@
       "writing 8 chunk\n",
       "------------------------------\n",
       "writing 9 chunk\n",
-      "------------------------------\n",
-      "writing 10 chunk\n",
-      "------------------------------\n",
-      "writing 11 chunk\n",
       "-----------------------------------------------------------------------\n",
       "All chunks are written. Now merging all the temporary files and compressing it.\n",
-      "Runtime of the program is 0.47 minutes\n"
+      "Runtime of the program is 0.52 minutes\n"
      ]
     }
    ],
    "source": [
-    "miRSim_main_module(mir_file,50000,10,10,33,save_path,'mirna_raw_data.fastq.gz','mirna_ground_truth.csv',\n",
-    "                        mir_gff,'mirna','fastq',True,'poisson','None',adaptor,2,2,parallel_thread,10,10,10)"
+    "miRSim_main_module(mir_file,50000,10,10,33,save_path,'mirna_raw_data.fastq.gz',\n",
+    "                   'mirna_ground_truth.csv',mir_gff,'mirna','fastq',True,\n",
+    "                   'poisson','None',adaptor,2,2,parallel_thread,10,10,10)"
    ]
   },
   {
@@ -767,7 +757,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.4"
+   "version": "3.9.7"
   },
   "latex_envs": {
    "LaTeX_envs_menu_present": true,
diff --git a/README.md b/README.md
old mode 100644
new mode 100755
index 8ffb17e..e3dce9d
--- a/README.md
+++ b/README.md
@@ -193,22 +193,19 @@ Total Number of sequence = 500000
 ```
 
 In order to generate such data you need to call miRSim module separately for each RNA type using following commands `(assuming minimum depth=default, file_type=fastq, encoding_quality=33, adaptor=default)`:
+**For miRNA synthetic data**
 
 ```
-For miRNA synthetic data
-
 python miRSim.py -i refs/mature_high_conf_hsa.fa -n mirna_raw_data.fastq.gz -g mirna_ground_truth.csv -gff refs/hsa_high_conf.gff3 -t 500000 -st 20 -s 10 -x 10 -b 5 -se 1001 -th 6 -rna miRNA
 ```
+**For piRNA synthetic data**
 
 ```
-For piRNA synthetic data
-
 python miRSim.py -i refs/piRNAdb.hsa.v1_7_5.fa -n pirna_raw_data.fastq.gz -g pirna_ground_truth.csv -gff refs/pirnadb.hg38.gff3 -t 500000 -st 10 -s 10 -x 5 -b 5 -se 1001 -th 6 -rna piRNA
 ```
+**For novel miRNA synthetic data**
 
 ```
-For novel miRNA synthetic data
-
 python miRSim.py -i refs/final_novel_seq_filtered.fa -n novel_mirna_raw_data.fastq.gz -g novel_mirna_ground_truth.csv -gff refs/novel_gff.gff3 -t 500000 -st 10 -s 10 -x 3 -b 2 -se 1001 -th 6 -rna novelRNA
 ```
 After generating synthetic data for each individual RNA, merge these fastq files and their ground truth csv. e.g.
diff --git a/check_sequence.py b/check_sequence.py
new file mode 100644
index 0000000..c6a4a23
--- /dev/null
+++ b/check_sequence.py
@@ -0,0 +1,10 @@
+#!/usr/bin/python
+
+def check_sequence(rna_dict, seq):
+    all_sequences = [v for _,v in rna_dict.items()]
+    if seq.upper() in all_sequences:
+        unique_flag = False
+        return unique_flag
+    else:
+        unique_flag = True
+        return unique_flag 
\ No newline at end of file
diff --git a/generate_sequence.py b/generate_sequence.py
old mode 100644
new mode 100755
index 711b351..4168f65
--- a/generate_sequence.py
+++ b/generate_sequence.py
@@ -6,6 +6,7 @@
 from cigar_generation import *
 from mir_location import *
 from sequence_alteration import *
+from check_sequence import *
 
 def generate_sequence(fasta_seq, gff_df, rna_dict, no_mir_chr, n_seq_per_chr, depth, seq_error, out, out_file,write_mode,repeat,distribution, no_mismatch_seed , no_mismatch_xseed,seed):
     
@@ -24,6 +25,7 @@ def generate_sequence(fasta_seq, gff_df, rna_dict, no_mir_chr, n_seq_per_chr, de
             chr_name = chr_list[i] + '$'
             mir_complete_list = list(gff_df[gff_df['chr'].str.contains(chr_name)].index)   
             if mir_complete_list:
+#                 print(i)
                 total_exp = n_seq_per_chr[i]
                 if not no_mir_chr[i] > len(mir_complete_list):                    
                     mir_idx = [random.randint(0,len(mir_complete_list)-1) for v in range(int(no_mir_chr[i]))]
@@ -58,14 +60,23 @@ def generate_sequence(fasta_seq, gff_df, rna_dict, no_mir_chr, n_seq_per_chr, de
                         mir_seq_new = rna_dict[mir]
                         mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)
                     elif seq_error == 'Seed_region':
-                        mir_seq_new = sequence_alteration(rna_dict[mir],'seed',no_mismatch_seed , no_mismatch_xseed, seed)
-                        mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)
+                        unique_flag = False
+                        while unique_flag == False:
+                            mir_seq_new = sequence_alteration(rna_dict[mir],'seed',no_mismatch_seed , no_mismatch_xseed, seed)
+                            mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)
+                            unique_flag = check_sequence(rna_dict,mir_seq_new)
                     elif seq_error == 'Outside_Seed_region':
-                        mir_seq_new = sequence_alteration(rna_dict[mir],'xseed',no_mismatch_seed , no_mismatch_xseed, seed)
-                        mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)
+                        unique_flag= False
+                        while unique_flag == False:
+                            mir_seq_new = sequence_alteration(rna_dict[mir],'xseed',no_mismatch_seed , no_mismatch_xseed, seed)
+                            mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)
+                            unique_flag = check_sequence(rna_dict,mir_seq_new)
                     elif seq_error == 'Both_region':
-                        mir_seq_new = sequence_alteration(rna_dict[mir],'both',no_mismatch_seed , no_mismatch_xseed, seed)
-                        mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)
+                        unique_flag= False
+                        while unique_flag == False:
+                            mir_seq_new = sequence_alteration(rna_dict[mir],'both',no_mismatch_seed , no_mismatch_xseed, seed)
+                            mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)
+                            unique_flag = check_sequence(rna_dict,mir_seq_new)
 
                     loc = mir_location(gff_df,mir,seed)
                     mir_depth = int(exp)
@@ -73,7 +84,14 @@ def generate_sequence(fasta_seq, gff_df, rna_dict, no_mir_chr, n_seq_per_chr, de
                     line += mir + '\t' + seq_error + '\t' + mir_cigar + '\t' + rna_dict[mir] + '\t' + mir_seq_new + '\t' + loc[0] + '\t' + str(loc[1]) + '\t' + str(loc[2]) + '\t' + str(mir_depth) + '\n'
                     rna_ground_truth.write(line)            
                     for dep in range(mir_depth):
-                        fasta_seq.append('>' + mir)
+                        if seq_error == 'None':
+                            fasta_seq.append('>' + mir + '_no_alterations')
+                        if seq_error == 'Seed_region':
+                            fasta_seq.append('>' + mir + '_seed_alterations')
+                        if seq_error == 'Seed_region':
+                            fasta_seq.append('>' + mir + '_xseed_alterations')
+                        if seq_error == 'Both_region':
+                            fasta_seq.append('>' + mir + '_both_seed_&_xseed_alterations')
                         fasta_seq.append(mir_seq_new)
 
     return fasta_seq
\ No newline at end of file
diff --git a/miRSim.ipynb b/miRSim.ipynb
new file mode 100755
index 0000000..7f2d3da
--- /dev/null
+++ b/miRSim.ipynb
@@ -0,0 +1,805 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import shutil\n",
+    "import random\n",
+    "import sys\n",
+    "import threading\n",
+    "import gzip\n",
+    "import time\n",
+    "import warnings\n",
+    "from scipy.stats import gamma\n",
+    "from scipy.stats import uniform\n",
+    "from scipy.stats import expon\n",
+    "from scipy.stats import poisson\n",
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def alter_nt(seq_seed,position):\n",
+    "    if seq_seed[position] == 'A':\n",
+    "        seq_seed[position] = 'C'\n",
+    "    elif seq_seed[position] == 'T':\n",
+    "        seq_seed[position] = 'G'\n",
+    "    elif seq_seed[position] == 'G':\n",
+    "        seq_seed[position] = 'A'\n",
+    "    else:\n",
+    "        seq_seed[position] = 'T'\n",
+    "        \n",
+    "    return seq_seed[position]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def sequence_alteration(mir_seq,site,no_mismatch_seed,no_mismatch_xseed, seed):\n",
+    "    if isinstance(seed, int):\n",
+    "        random.seed(seed)\n",
+    "    if site == 'seed':\n",
+    "        seq_seed = mir_seq[1:7]\n",
+    "        seq_seed = list(seq_seed)\n",
+    "        idx = list(range(1,7))\n",
+    "        for i in range(no_mismatch_seed):            \n",
+    "            x1 = random.randint(0, len(idx)-1)\n",
+    "            seq_seed[x1] = alter_nt(seq_seed,x1)\n",
+    "            idx.pop(x1)\n",
+    "            new_seed = \"\"\n",
+    "            new_seed = new_seed.join(seq_seed)\n",
+    "        mir_seq_new = mir_seq[:1] + new_seed + mir_seq[7:]\n",
+    "        \n",
+    "    elif site == 'xseed':\n",
+    "        Xseq_seed = mir_seq[7:]\n",
+    "        Xseq_seed = list(Xseq_seed)\n",
+    "        idx = list(range(0,len(Xseq_seed)))\n",
+    "        for  j in range(no_mismatch_xseed):            \n",
+    "            x3 = random.randint(0, len(idx)-1)\n",
+    "            Xseq_seed[x3] = alter_nt(Xseq_seed,x3)\n",
+    "            idx.pop(x3)\n",
+    "            new_xseed = \"\"\n",
+    "            new_xseed = new_xseed.join(Xseq_seed)\n",
+    "        mir_seq_new = mir_seq[:7] + new_xseed\n",
+    "        \n",
+    "    elif site == 'both':\n",
+    "        no_mismatch_both = no_mismatch_seed + no_mismatch_xseed\n",
+    "        for k in range(no_mismatch_both):\n",
+    "            mir_seq1 = sequence_alteration(mir_seq,'seed',no_mismatch_seed , no_mismatch_xseed, seed)\n",
+    "            mir_seq_new = sequence_alteration(mir_seq1,'xseed',no_mismatch_seed , no_mismatch_xseed, seed)\n",
+    "    \n",
+    "    return mir_seq_new"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def cigar_generation(actual_seq,altered_seq):\n",
+    "    seq1 = list(actual_seq)\n",
+    "    seq2 = list(altered_seq)\n",
+    "    cigar = []\n",
+    "    if len(seq1) == len(seq2):\n",
+    "        for idx in range(len(seq1)):\n",
+    "            if seq1[idx] == seq2[idx]:\n",
+    "                cigar.append('-')\n",
+    "            else:\n",
+    "                cigar.append(seq2[idx])\n",
+    "    else:\n",
+    "        cigar = ['-' for l in range(len(seq1))]\n",
+    "        \n",
+    "    return ''.join(cigar).upper()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def gff(gff_file,rna_type):\n",
+    "    gff_file = open(gff_file).read().split('\\n')\n",
+    "    df = pd.DataFrame()\n",
+    "    if rna_type == 'mirna':\n",
+    "        gff_file = gff_file[13:-1]\n",
+    "        chr_loc = [c.split('\\t')[0] for c in gff_file]\n",
+    "        mirna_type = [c.split('\\t')[2] for c in gff_file]\n",
+    "        chr_start = [c.split('\\t')[3] for c in gff_file]\n",
+    "        chr_end = [c.split('\\t')[4] for c in gff_file]\n",
+    "        strand = [c.split('\\t')[6] for c in gff_file]\n",
+    "        mir_id = [c.split('\\t')[8].split(';')[-2].split('=')[-1] for c in gff_file]\n",
+    "        df['mir_id'] = mir_id\n",
+    "        df['chr'] = chr_loc\n",
+    "        df['chr_start'] = chr_start\n",
+    "        df['chr_end'] = chr_end\n",
+    "        df['strand'] = strand\n",
+    "        df['mirna_type'] = mirna_type\n",
+    "        df = df.set_index('mir_id')\n",
+    "        df = df[~df['mirna_type'].str.contains('miRNA_primary_transcript')]\n",
+    "        df = df.drop('mirna_type',axis=1)\n",
+    "    else:\n",
+    "        gff_file = gff_file[1:-1]\n",
+    "        rna_id = [c.split('\\t')[-1] for c in gff_file]\n",
+    "        chr_loc = [c.split('\\t')[0] for c in gff_file]\n",
+    "        chr_start = [c.split('\\t')[3] for c in gff_file]\n",
+    "        chr_end = [c.split('\\t')[4] for c in gff_file]\n",
+    "        strand = [c.split('\\t')[6] for c in gff_file]\n",
+    "        df['rna_id'] = rna_id\n",
+    "        df['chr'] = chr_loc\n",
+    "        df['chr_start'] = chr_start\n",
+    "        df['chr_end'] = chr_end\n",
+    "        df['strand'] = strand\n",
+    "        df = df.set_index('rna_id')\n",
+    "    \n",
+    "    return df\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def mir_location(df,rna_name,seed):        \n",
+    "    if isinstance(seed, int):\n",
+    "        random.seed(seed)\n",
+    "    df1 = df.loc[[rna_name],:]\n",
+    "    if df1.shape[0] == 1:\n",
+    "        return [df1.iloc[0,0],df1.iloc[0,1],df1.iloc[0,2]]\n",
+    "    else:\n",
+    "        random_idx = random.randint(1,df1.shape[0]-1)\n",
+    "        return [df1.iloc[random_idx,0],df1.iloc[random_idx,1],df1.iloc[random_idx,2]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def sort_by_chromosome(chr_list):\n",
+    "    a = [x.split('chr')[-1] for x in chr_list]\n",
+    "    if 'X' in a: \n",
+    "        X_present = True\n",
+    "        a.remove('X')\n",
+    "    else:\n",
+    "        X_present = False\n",
+    "        \n",
+    "    if 'Y' in a: \n",
+    "        Y_present = True\n",
+    "        a.remove('Y')\n",
+    "    else:\n",
+    "        Y_present = False\n",
+    "    if 'M' in a: \n",
+    "        M_present = True\n",
+    "        a.remove('M')\n",
+    "    else:\n",
+    "        M_present = False\n",
+    "    if 'MT' in a:\n",
+    "        MT_present = True\n",
+    "        a.remove('MT')\n",
+    "    else:\n",
+    "        MT_present = False\n",
+    "        \n",
+    "    a = list(map(int,a))\n",
+    "    b = sorted(a)\n",
+    "    c = ['chr'+str(x) for x in b]\n",
+    "    if X_present:\n",
+    "        c.append('chrX')\n",
+    "    if Y_present:\n",
+    "        c.append('chrY')\n",
+    "    if M_present:\n",
+    "        c.append('chrM')\n",
+    "    elif MT_present:\n",
+    "        c.append('chrMT')\n",
+    "        \n",
+    "    return(c)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def sequence_calculation(total_no_seq, desired_seq_percent,depth,seed,gff_df):\n",
+    "    \n",
+    "    chr_list = sort_by_chromosome(list(gff_df['chr'].unique()))\n",
+    "    if not desired_seq_percent == 0:\n",
+    "        n_seq = total_no_seq*desired_seq_percent/100\n",
+    "        chr_count = pd.DataFrame(pd.Series(gff_df['chr']).value_counts())\n",
+    "        chr_count = pd.DataFrame(chr_count, index=chr_list)\n",
+    "        n_seq_per_chr = [i*n_seq for i in list(chr_count['chr']/gff_df.shape[0])]\n",
+    "        n_seq_per_chr = list(map(int,n_seq_per_chr))\n",
+    "        n_seq_per_chr[-1] = int(n_seq - sum(n_seq_per_chr[:-1]))\n",
+    "\n",
+    "        no_mir_chr = []    \n",
+    "        actual_no_mir_chr = [len(list(gff_df[gff_df['chr'].str.contains(chr_name+'$')].index)) for chr_name in chr_list]\n",
+    "        for v1,v2 in zip(n_seq_per_chr,actual_no_mir_chr):\n",
+    "            if v1 > depth:\n",
+    "                no_mir = len(expression_split(v1,v2,'poisson',seed,depth))\n",
+    "                if no_mir >= 1:\n",
+    "                    no_mir_chr.append(no_mir)\n",
+    "                else:\n",
+    "                    print('Error : Minimum number of miRs per chromosome should be %d. With your provided information it is %d'%(1, no_mir))\n",
+    "                    print('Hint: Try reducing minimum depth so that minimum number of miRs per chromosome becomes more than 1.')\n",
+    "                    sys.exit(1)\n",
+    "            else:\n",
+    "                no_mir = 1\n",
+    "                no_mir_chr.append(no_mir)\n",
+    "    else:\n",
+    "        no_mir_chr = []\n",
+    "        n_seq_per_chr = []\n",
+    "#     print(n_seq_per_chr, len(n_seq_per_chr))\n",
+    "    return no_mir_chr,n_seq_per_chr"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def expression_split(number, number_of_subsections, distribution, seed, min_random_number_desired):\n",
+    "    split_number_list = []\n",
+    "    cumulative_sum_of_random_numbers = 0\n",
+    "    current_subsection = 1\n",
+    "    max_random_number = int(number/number_of_subsections)\n",
+    "    if isinstance(seed, int):\n",
+    "        np.random.RandomState(seed)\n",
+    "    else:\n",
+    "        seed = np.random.RandomState()\n",
+    "    if min_random_number_desired < number:\n",
+    "        if min_random_number_desired > max_random_number:\n",
+    "#             print(\"WARNING: Cannot have min number as {} and split {} in {} subsections\".format(min_random_number_desired, number, number_of_subsections))\n",
+    "            number_of_subsections = int(np.floor(number/min_random_number_desired))\n",
+    "            return expression_split(number,number_of_subsections,distribution, seed, min_random_number_desired)\n",
+    "                    \n",
+    "        elif distribution == 'uniform':\n",
+    "            split_num1 = uniform.rvs(size=number_of_subsections, loc = 10, scale=20,random_state=seed)\n",
+    "        elif distribution == 'gamma':\n",
+    "            split_num1 = gamma.rvs(a=5, size=number_of_subsections,random_state=seed)\n",
+    "        elif distribution == 'exponential':\n",
+    "            split_num1 = expon.rvs(scale=1,loc=0,size=number_of_subsections,random_state=seed)\n",
+    "        elif distribution == 'poisson':\n",
+    "            split_num1 = poisson.rvs(mu=3, size=number_of_subsections,random_state=seed)\n",
+    "        try:\n",
+    "            split_num1 = [int(number*v/sum(split_num1)) for v in split_num1]\n",
+    "        except:\n",
+    "            # Error may occur when split_num1 = [0]\n",
+    "            expression_split(number, number_of_subsections, distribution, seed, min_random_number_desired)\n",
+    "        if len(split_num1) > 1:\n",
+    "            num_test = [1 for v in split_num1 if v <= min_random_number_desired]\n",
+    "            if sum(num_test) > int(0.25*len(split_num1)):\n",
+    "                return expression_split(number,int(number_of_subsections*0.75),distribution, seed, min_random_number_desired)\n",
+    "        for i in range(len(split_num1)):\n",
+    "            if split_num1[i] < min_random_number_desired:\n",
+    "                split_num1[i] = min_random_number_desired                    \n",
+    "        split_num1[-1] = number-sum(split_num1[:-1])\n",
+    "        if sum([1 for v in split_num1 if v<=0]) >= 1:\n",
+    "            return expression_split(number,int(number_of_subsections*0.75),distribution, seed, min_random_number_desired)\n",
+    "        return split_num1\n",
+    "    else:\n",
+    "#         print('WARNING : minimum depth is greater than provided number and can not be splitted.')\n",
+    "        expression_split(number,1,distribution, seed, number)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def update_gff(gff_df,out,ground_truth_filename):\n",
+    "    rna_truth = pd.read_csv(os.path.join(out,ground_truth_filename),sep = '\\t')\n",
+    "    rna_truth = rna_truth.set_index('RNA_ID')\n",
+    "    for idx in range(rna_truth.shape[0]):\n",
+    "        chr_start = str(rna_truth.iloc[idx,4])\n",
+    "        chr_end = str(rna_truth.iloc[idx,5])\n",
+    "        gff_df = gff_df[~gff_df.isin([chr_start,chr_end])]\n",
+    "        gff_df = gff_df.dropna()\n",
+    "    return gff_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_sequence(fasta_seq, gff_df, rna_dict, no_mir_chr, n_seq_per_chr, depth, seq_error, out, out_file,write_mode,repeat,distribution, no_mismatch_seed , no_mismatch_xseed,seed):\n",
+    "    \n",
+    "    if isinstance(seed, int):\n",
+    "        random.seed(seed)\n",
+    "    if write_mode == 'write':\n",
+    "        rna_ground_truth = open(os.path.join(out,out_file),'w')\n",
+    "        header = 'RNA_ID\\tImpure_Region\\tCigar_String\\tref_Sequence\\tsynthetic_sequence\\tchr\\tchr_start\\tchr_end\\tExpression_count\\n'\n",
+    "        rna_ground_truth.write(header)\n",
+    "    elif write_mode == 'append':\n",
+    "        rna_ground_truth = open(os.path.join(out,out_file),'a')\n",
+    "    \n",
+    "    chr_list = list(gff_df['chr'].unique())\n",
+    "    if no_mir_chr and n_seq_per_chr:\n",
+    "        for i in range(len(chr_list)):\n",
+    "            chr_name = chr_list[i] + '$'\n",
+    "            mir_complete_list = list(gff_df[gff_df['chr'].str.contains(chr_name)].index)   \n",
+    "            if mir_complete_list:\n",
+    "#                 print(i)\n",
+    "                total_exp = n_seq_per_chr[i]\n",
+    "                if not no_mir_chr[i] > len(mir_complete_list):                    \n",
+    "                    mir_idx = [random.randint(0,len(mir_complete_list)-1) for v in range(int(no_mir_chr[i]))]\n",
+    "                    mir_list = [mir_complete_list[v] for v in mir_idx]\n",
+    "                else:                \n",
+    "                    if repeat:\n",
+    "                        complete_flag = True\n",
+    "                        while complete_flag == True:\n",
+    "                            mir_complete_list += mir_complete_list\n",
+    "                            if no_mir_chr[i] < len(mir_complete_list):\n",
+    "                                complete_flag = False\n",
+    "                        mir_idx = [random.randint(0,len(mir_complete_list)-1) for v in range(int(no_mir_chr[i]))]\n",
+    "                        mir_list = [mir_complete_list[v] for v in mir_idx]\n",
+    "                    else:                    \n",
+    "                        mir_idx = [random.randint(0,len(mir_complete_list)-1) for v in range(int(mir_complete_list))]\n",
+    "                        mir_list = [mir_complete_list[v] for v in mir_idx]\n",
+    "\n",
+    "                complete_flag = True\n",
+    "                while complete_flag:\n",
+    "                    try:\n",
+    "                        expression_counts = expression_split(total_exp,len(mir_list),distribution,seed,depth)\n",
+    "                        if expression_counts:\n",
+    "                            complete_flag = False\n",
+    "                    except:\n",
+    "                        print('The number of RNAs and total available depth in %s  is %d and %d too less with error %s' %(chr_name,len(mir_list),total_exp,seq_error))\n",
+    "                        expression_counts = expression_split(total_exp,len(mir_list),distribution,seed,int(depth*0.5))\n",
+    "                        if expression_counts:\n",
+    "                            complete_flag = False\n",
+    "\n",
+    "                for mir,exp in zip(mir_list,expression_counts):\n",
+    "                    if seq_error == 'None':\n",
+    "                        mir_seq_new = rna_dict[mir]\n",
+    "                        mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
+    "                    elif seq_error == 'Seed_region':\n",
+    "                        mir_seq_new = sequence_alteration(rna_dict[mir],'seed',no_mismatch_seed , no_mismatch_xseed, seed)\n",
+    "                        mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
+    "                    elif seq_error == 'Outside_Seed_region':\n",
+    "                        mir_seq_new = sequence_alteration(rna_dict[mir],'xseed',no_mismatch_seed , no_mismatch_xseed, seed)\n",
+    "                        mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
+    "                    elif seq_error == 'Both_region':\n",
+    "                        mir_seq_new = sequence_alteration(rna_dict[mir],'both',no_mismatch_seed , no_mismatch_xseed, seed)\n",
+    "                        mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
+    "\n",
+    "                    loc = mir_location(gff_df,mir,seed)\n",
+    "                    mir_depth = int(exp)\n",
+    "                    line = ''\n",
+    "                    line += mir + '\\t' + seq_error + '\\t' + mir_cigar + '\\t' + rna_dict[mir] + '\\t' + mir_seq_new + '\\t' + loc[0] + '\\t' + str(loc[1]) + '\\t' + str(loc[2]) + '\\t' + str(mir_depth) + '\\n'\n",
+    "                    rna_ground_truth.write(line)            \n",
+    "                    for dep in range(mir_depth):\n",
+    "                        fasta_seq.append('>' + mir)\n",
+    "                        fasta_seq.append(mir_seq_new)\n",
+    "\n",
+    "    return fasta_seq"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def write_small_fastq_chunks(fasta_seq_chunk,out_file,counter,out_file_type,quality_char,adaptor,seed):\n",
+    "\n",
+    "    primer_nt = ['A','T','G','C']\n",
+    "    for line in fasta_seq_chunk:\n",
+    "        line1 = \"\"\n",
+    "        line_no = fasta_seq_chunk.index(line)\n",
+    "        if \">\" in line:\n",
+    "            if out_file_type == 'fastq':\n",
+    "                line1 += \"@\" + line[1:] + '-' + str(counter) + '\\n'\n",
+    "                seq = fasta_seq_chunk[line_no+1] + adaptor # \"TGGAATTCTCGGGTGCCAAGG\"\n",
+    "                primer_len = 75 - len(seq)\n",
+    "                primer_string = ''.join(char*random.randint(0,100) for char in primer_nt)\n",
+    "                primer_string = list(primer_string)\n",
+    "                primer_string = random.sample(primer_string,len(primer_string))\n",
+    "                primer_string = ''.join(c for c in primer_string)[:primer_len]\n",
+    "                seq += primer_string                \n",
+    "                line1 += fasta_seq_chunk[line_no+1] + adaptor + primer_string + '\\n' # \"TGGAATTCTCGGGTGCCAAGG\" + '\\n'\n",
+    "                line1 += '+' + '\\n'\n",
+    "                quality_string = ''.join(char*random.randint(0,15) for char in quality_char)\n",
+    "                quality_string = list(quality_string)\n",
+    "                quality_string = random.sample(quality_string,len(quality_string))\n",
+    "                quality_string = ''.join(c for c in quality_string)\n",
+    "                quality_flag = False\n",
+    "                while quality_flag:\n",
+    "                    if len(quality_string) < len(seq):\n",
+    "                        quality_string += quality_string\n",
+    "                        quality_string = list(quality_string)\n",
+    "                        quality_string = random.sample(quality_string,len(quality_string))\n",
+    "                        quality_string = ''.join(c for c in quality_string)\n",
+    "                    else:\n",
+    "                        quality_flag = True\n",
+    "                        break\n",
+    "                line1 += quality_string[:len(seq)] + '\\n'\n",
+    "                out_file.write(line1)\n",
+    "                counter += 1\n",
+    "            elif out_file_type == 'fasta':\n",
+    "                line1 += line + '-' + str(counter) + '\\n'\n",
+    "                line1 += fasta_seq_chunk[line_no+1] + adaptor + '\\n'\n",
+    "                out_file.write(line1)\n",
+    "                counter += 1\n",
+    "\n",
+    "#     out_file.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def execute_parallel_thread_for_file_write(fasta_seq_chunk,out_file,counter,out,out_file_type,quality_char,adaptor,seed):\n",
+    "    max_no_line = len(fasta_seq_chunk)\n",
+    "    if max_no_line > 10000:\n",
+    "        chunk_size = 10000\n",
+    "        no_of_splitted_file = int(np.ceil(max_no_line/chunk_size))\n",
+    "    else:\n",
+    "        chunk_size = max_no_line\n",
+    "        no_of_splitted_file = 1\n",
+    "    for j in range(no_of_splitted_file):\n",
+    "        if out_file_type == 'fastq':\n",
+    "            tmp_file_name = out_file + '_' + str(j) + '.fastq'\n",
+    "        else:\n",
+    "            tmp_file_name = out_file + '_' + str(j) + '.fasta'\n",
+    "            \n",
+    "        out_file1 = open(os.path.join(out,'temp',tmp_file_name),'w')\n",
+    "        fasta_seq_small_chunk = fasta_seq_chunk[j*chunk_size:(j+1)*chunk_size]\n",
+    "        counter1 = counter + j*len(fasta_seq_small_chunk)\n",
+    "        write_small_fastq_chunks(fasta_seq_small_chunk,out_file1,counter1,out_file_type,quality_char,adaptor,seed)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def write_fastq(fasta_seq, adaptor, ascii_base, out, out_file_name,out_file_type,seed,parallel_thread):\n",
+    "                \n",
+    "    # Not considering phred score less then 20 so that mean quality score is always greater than 20 for smooth bioinformatics analysis\n",
+    "    if ascii_base == 33:\n",
+    "        quality_char = ['5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K']\n",
+    "    elif ascii_base == 64:\n",
+    "        quality_char = ['T','U','V','W','X','Y','Z', '^',' ', '-', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']\n",
+    "    else:\n",
+    "        quality_char = []\n",
+    "    \n",
+    "    max_no_line = len(fasta_seq)\n",
+    "    no_of_splitted_file = parallel_thread\n",
+    "    chunk_size = int(np.ceil(max_no_line/no_of_splitted_file))\n",
+    "    print('Writing %d temporary files with chunk size of %d'%(no_of_splitted_file,chunk_size))\n",
+    "    if 'temp' in list(os.listdir(out)):\n",
+    "        cmd = 'rm -rf '+ os.path.join(out,'temp','..?* .[!.]*') + ' && rm -rf '+ os.path.join(out,'temp') + ' 2> /dev/null'\n",
+    "        os.system(cmd)\n",
+    "#         shutil.rmtree(os.path.join(out,'temp'))\n",
+    "        os.mkdir(os.path.join(out,'temp'))\n",
+    "    else:\n",
+    "        os.mkdir(os.path.join(out,'temp'))\n",
+    "        \n",
+    "    for j in range(no_of_splitted_file):\n",
+    "        tmp_file_name = 'tmp_'+str(j)            \n",
+    "        out_file = tmp_file_name \n",
+    "        fasta_seq_chunk = fasta_seq[j*chunk_size:(j+1)*chunk_size]\n",
+    "        if j >=1:\n",
+    "            counter = j*len(fasta_seq_chunk)\n",
+    "            print('------------------------------')\n",
+    "            print('writing %d chunk'%j)\n",
+    "            threading.Thread(target=execute_parallel_thread_for_file_write,args=(fasta_seq_chunk,out_file,counter,out,out_file_type,quality_char,adaptor,seed,)).start()\n",
+    "        else:\n",
+    "            pass\n",
+    "\n",
+    "    counter = 0\n",
+    "    tmp_file_name = 'tmp_'+str(0)\n",
+    "    out_file = tmp_file_name \n",
+    "    fasta_seq_chunk = fasta_seq[0*chunk_size:(0+1)*chunk_size]\n",
+    "    execute_parallel_thread_for_file_write(fasta_seq_chunk,out_file,counter,out,out_file_type,quality_char,adaptor,seed)\n",
+    "    time.sleep(20)\n",
+    "    print('-----------------------------------------------------------------------')\n",
+    "    print('All chunks are written. Now merging all the temporary files and compressing it.')\n",
+    "    if out_file_type == 'fastq':\n",
+    "        cmd = 'gzip '+ os.path.join(out,'temp') +'/tmp_*.fastq -c > ' + os.path.join(out,out_file_name)\n",
+    "    else:\n",
+    "        cmd = 'gzip '+ os.path.join(out,'temp') + '/tmp_*.fasta -c > ' + os.path.join(out,out_file_name)\n",
+    "    os.system(cmd)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_synthetic_data(fa_file,total_no_seq,std_seq_percent,seed_error_percent,xseed_error_percent, \n",
+    "                            both_error_percent,depth,ascii_base,out,out_file_name,ground_truth_filename,\n",
+    "                            gff_file,mir_type,out_file_type,repeat,distribution,seed,adaptor,no_mismatch_seed ,\n",
+    "                            no_mismatch_xseed,parallel_thread):\n",
+    "          \n",
+    "    file_read = open(fa_file).read().split('\\n')\n",
+    "    file_read = file_read[:-1]\n",
+    "    rna_dict = {}\n",
+    "    for line in file_read:\n",
+    "        if \">\" in line:\n",
+    "            line_no = file_read.index(line)\n",
+    "            rna_dict[line[1:]] = file_read[line_no+1]\n",
+    "\n",
+    "    fasta_seq = []\n",
+    "    # for pure sequences only\n",
+    "    gff_df = gff(gff_file,mir_type)\n",
+    "    no_mir_chr,n_seq_per_chr = sequence_calculation(total_no_seq, std_seq_percent, depth,seed,gff_df)    \n",
+    "    fasta_seq = generate_sequence(fasta_seq,gff_df, rna_dict, no_mir_chr, n_seq_per_chr, depth, 'None', out, ground_truth_filename,'write',repeat,distribution, no_mismatch_seed , no_mismatch_xseed, seed)\n",
+    "    print('****Pure Sequence are generated.****')\n",
+    "\n",
+    "    # for seed region error sequences only\n",
+    "    if not repeat:\n",
+    "        gff_df = update_gff(gff_df,out,ground_truth_filename)  # Update gff dataframe so that there is no repitition.\n",
+    "    no_mir_chr,n_seq_per_chr = sequence_calculation(total_no_seq, seed_error_percent, depth,seed,gff_df)\n",
+    "    fasta_seq = generate_sequence(fasta_seq,gff_df, rna_dict, no_mir_chr, n_seq_per_chr, depth, 'Seed_region', out, ground_truth_filename,'append',repeat,distribution, no_mismatch_seed , no_mismatch_xseed, seed)\n",
+    "    print('****Sequence with error in seed region are generated.****')\n",
+    "\n",
+    "    # for outside seed region error sequences only\n",
+    "    if not repeat:\n",
+    "        gff_df = update_gff(gff_df,out,ground_truth_filename)\n",
+    "    no_mir_chr,n_seq_per_chr = sequence_calculation(total_no_seq, xseed_error_percent, depth,seed,gff_df)\n",
+    "    fasta_seq = generate_sequence(fasta_seq,gff_df, rna_dict, no_mir_chr, n_seq_per_chr, depth, 'Outside_Seed_region', out, ground_truth_filename,'append',repeat,distribution, no_mismatch_seed , no_mismatch_xseed,seed)\n",
+    "    print('****Sequence with error in xseed region are generated.****')\n",
+    "\n",
+    "    # for both seed region error and outside seed region error sequences\n",
+    "    if not repeat:\n",
+    "        gff_df = update_gff(gff_df,out,ground_truth_filename)\n",
+    "    no_mir_chr, n_seq_per_chr = sequence_calculation(total_no_seq, both_error_percent, depth,seed,gff_df)\n",
+    "    fasta_seq = generate_sequence(fasta_seq,gff_df, rna_dict, no_mir_chr, n_seq_per_chr, depth, 'Both_region', out, ground_truth_filename,'append',repeat,distribution, no_mismatch_seed , no_mismatch_xseed,seed)\n",
+    "    print('****Sequence with error in both seed and xseed region are generated.****')\n",
+    "\n",
+    "    write_fastq(fasta_seq, adaptor, ascii_base, out, out_file_name,out_file_type,seed,parallel_thread)\n",
+    "\n",
+    "#     merge similar synthetic sequences\n",
+    "    ground_truth_df = pd.read_csv(os.path.join(out,ground_truth_filename),sep='\\t')\n",
+    "    ground_truth_df = ground_truth_df.groupby(['RNA_ID','Impure_Region','Cigar_String','ref_Sequence','synthetic_sequence','chr','chr_start','chr_end']).sum().reset_index()\n",
+    "    ground_truth_df = ground_truth_df.sort_values('Impure_Region')\n",
+    "    ground_truth_df.to_csv(os.path.join(out,ground_truth_filename),encoding='utf-8',index=False)\n",
+    "\n",
+    "    # deleting temporary folder\n",
+    "    cmd = 'rm -rf '+ os.path.join(out,'temp','..?* .[!.]*') + ' && rm -rf '+ os.path.join(out,'temp') + ' 2> /dev/null'\n",
+    "    os.system(cmd)\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def miRSim_main_module(fa_file,total_no_seq,std_seq_percent,depth,ascii_base,out,out_file_name,ground_truth_filename,\n",
+    "                       gff_file,mir_type,out_file_type,repeat,distribution,seed,adaptor,no_mismatch_seed , no_mismatch_xseed,\n",
+    "                       parallel_thread,*argv):\n",
+    "    start = time.time()\n",
+    "    \n",
+    "    if no_mismatch_seed >= 6:\n",
+    "        sys.exit(\"Maximum number of mismatch in seed region should not be more than 6.\")\n",
+    "    \n",
+    "    if no_mismatch_xseed >= 12:\n",
+    "        sys.exit(\"Maximum number of mismatch in xseed region should not be more than 12.\")\n",
+    "    \n",
+    "    if len(list(argv)) == 1:\n",
+    "        non_rna = list(argv)[0]\n",
+    "        seed_error_percent = int(non_rna/3)\n",
+    "        xseed_error_percent = int(non_rna/3)\n",
+    "        both_error_percent = non_rna - seed_error_percent - xseed_error_percent\n",
+    "        \n",
+    "        generate_synthetic_data(fa_file,total_no_seq,std_seq_percent,seed_error_percent,xseed_error_percent, \n",
+    "                                both_error_percent,depth,ascii_base,out,out_file_name,ground_truth_filename,\n",
+    "                                gff_file,mir_type,out_file_type,repeat,distribution,seed,adaptor,no_mismatch_seed ,\n",
+    "                                no_mismatch_xseed,parallel_thread)\n",
+    "        end = time.time()\n",
+    "        time_taken = '%.2f' % ((end-start)/60)\n",
+    "        print(f\"Runtime of the program is {time_taken} minutes\")\n",
+    "        \n",
+    "        \n",
+    "    elif len(list(argv)) == 3:  \n",
+    "        seed_error_percent,xseed_error_percent,both_error_percent = list(argv)\n",
+    "        generate_synthetic_data(fa_file,total_no_seq,std_seq_percent,seed_error_percent,xseed_error_percent, \n",
+    "                                both_error_percent,depth,ascii_base,out,out_file_name,ground_truth_filename,\n",
+    "                                gff_file,mir_type,out_file_type,repeat,distribution,seed,adaptor,no_mismatch_seed , no_mismatch_xseed,\n",
+    "                                parallel_thread)\n",
+    "        end = time.time()\n",
+    "        time_taken = '%.2f' % ((end-start)/60)\n",
+    "        print(f\"Runtime of the program is {time_taken} minutes\")\n",
+    "    else:\n",
+    "        sys.exit(\"Provide either non-RNA fraction or seed,xseed error fractions.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Change the path of required files**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "save_path = os.getcwd()\n",
+    "mir_file = '/mnt/miRSim/refs/mature_high_conf_hsa.fa'\n",
+    "mir_gff = '/mnt/miRSim/refs/hsa_high_conf.gff3'\n",
+    "adaptor = 'TGGAATTCTCGGGTGCCAAGG'\n",
+    "parallel_thread = 12\n",
+    "pir_file = '/mnt/miRSim/refs/piRNAdb.hsa.v1_7_5.fa'\n",
+    "pir_gff = '/mnt/miRSim/refs/pirnadb.hg38.gff3'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "****Pure Sequence are generated.****\n",
+      "****Sequence with error in seed region are generated.****\n",
+      "****Sequence with error in xseed region are generated.****\n",
+      "****Sequence with error in both seed and xseed region are generated.****\n",
+      "Writing 12 temporary files with chunk size of 3334\n",
+      "------------------------------\n",
+      "writing 1 chunk\n",
+      "------------------------------\n",
+      "writing 2 chunk\n",
+      "------------------------------\n",
+      "writing 3 chunk\n",
+      "------------------------------\n",
+      "writing 4 chunk\n",
+      "------------------------------\n",
+      "writing 5 chunk\n",
+      "------------------------------\n",
+      "writing 6 chunk\n",
+      "------------------------------\n",
+      "writing 7 chunk\n",
+      "------------------------------\n",
+      "writing 8 chunk\n",
+      "------------------------------\n",
+      "writing 9 chunk\n",
+      "------------------------------\n",
+      "writing 10 chunk\n",
+      "------------------------------\n",
+      "writing 11 chunk\n",
+      "-----------------------------------------------------------------------\n",
+      "All chunks are written. Now merging all the temporary files and compressing it.\n",
+      "Runtime of the program is 0.47 minutes\n"
+     ]
+    }
+   ],
+   "source": [
+    "miRSim_main_module(mir_file,50000,10,10,33,save_path,'mirna_raw_data.fastq.gz','mirna_ground_truth.csv',\n",
+    "                        mir_gff,'mirna','fastq',True,'poisson','None',adaptor,2,2,parallel_thread,10,10,10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "hide_input": false,
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  },
+  "latex_envs": {
+   "LaTeX_envs_menu_present": true,
+   "autoclose": false,
+   "autocomplete": true,
+   "bibliofile": "biblio.bib",
+   "cite_by": "apalike",
+   "current_citInitial": 1,
+   "eqLabelWithNumbers": true,
+   "eqNumInitial": 1,
+   "hotkeys": {
+    "equation": "Ctrl-E",
+    "itemize": "Ctrl-I"
+   },
+   "labels_anchors": false,
+   "latex_user_defs": false,
+   "report_style_numbering": false,
+   "user_envs_cfg": false
+  },
+  "nbTranslate": {
+   "displayLangs": [
+    "*"
+   ],
+   "hotkey": "alt-t",
+   "langInMainMenu": true,
+   "sourceLang": "en",
+   "targetLang": "fr",
+   "useGoogleTranslate": true
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/requirements.txt b/requirements.txt
index 1ff3844..f950ac3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@ scipy==1.4.1
 threaded==4.1.0
 numpy==1.18.1
 pandas==1.0.1
+random
\ No newline at end of file