unique identifier added

vivekruhela · Apr 22, 2022 · 30a065b · 30a065b
1 parent 1c8d33f
commit 30a065b
Show file tree

Hide file tree

Showing 6 changed files with 917 additions and 96 deletions.
diff --git a/Notebook/miRSim.ipynb b/Notebook/miRSim.ipynb
@@ -19,46 +19,14 @@
     "from scipy.stats import gamma\n",
     "from scipy.stats import uniform\n",
     "from scipy.stats import expon\n",
-    "from scipy.stats import poisson"
+    "from scipy.stats import poisson\n",
+    "from tqdm import tqdm"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c1a9746d75674500bca4c8099f5f1bb2",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/vivek/anaconda3/lib/python3.7/site-packages/tqdm/std.py:648: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n",
-      "  from pandas import Panel\n"
-     ]
-    }
-   ],
-   "source": [
-    "from tqdm import tqdm_notebook as tqdm\n",
-    "tqdm().pandas()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
    "outputs": [],
    "source": [
     "def alter_nt(seq_seed,position):\n",
@@ -76,7 +44,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -118,7 +86,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -140,7 +108,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -184,7 +152,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -201,7 +169,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -246,7 +214,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def check_sequence(rna_dict, seq):\n",
+    "    all_sequences = [v for _,v in rna_dict.items()]\n",
+    "    if seq.upper() in all_sequences:\n",
+    "        unique_flag = False\n",
+    "        return unique_flag\n",
+    "    else:\n",
+    "        unique_flag = True\n",
+    "        return unique_flag    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -284,7 +268,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -334,7 +318,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -351,17 +335,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "def generate_sequence(fasta_seq, gff_df, rna_dict, no_mir_chr, n_seq_per_chr, depth, seq_error, out, out_file,write_mode,repeat,distribution, no_mismatch_seed , no_mismatch_xseed,seed):\n",
     "    \n",
@@ -415,30 +391,46 @@
     "                        mir_seq_new = rna_dict[mir]\n",
     "                        mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
     "                    elif seq_error == 'Seed_region':\n",
-    "                        mir_seq_new = sequence_alteration(rna_dict[mir],'seed',no_mismatch_seed , no_mismatch_xseed, seed)\n",
-    "                        mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
+    "                        unique_flag = False\n",
+    "                        while unique_flag == False:\n",
+    "                            mir_seq_new = sequence_alteration(rna_dict[mir],'seed',no_mismatch_seed , no_mismatch_xseed, seed)\n",
+    "                            mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
+    "                            unique_flag = check_sequence(rna_dict,mir_seq_new)\n",
     "                    elif seq_error == 'Outside_Seed_region':\n",
-    "                        mir_seq_new = sequence_alteration(rna_dict[mir],'xseed',no_mismatch_seed , no_mismatch_xseed, seed)\n",
-    "                        mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
+    "                        unique_flag= False\n",
+    "                        while unique_flag == False:\n",
+    "                            mir_seq_new = sequence_alteration(rna_dict[mir],'xseed',no_mismatch_seed , no_mismatch_xseed, seed)\n",
+    "                            mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
+    "                            unique_flag = check_sequence(rna_dict,mir_seq_new)\n",
     "                    elif seq_error == 'Both_region':\n",
-    "                        mir_seq_new = sequence_alteration(rna_dict[mir],'both',no_mismatch_seed , no_mismatch_xseed, seed)\n",
-    "                        mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
+    "                        unique_flag= False\n",
+    "                        while unique_flag == False:\n",
+    "                            mir_seq_new = sequence_alteration(rna_dict[mir],'both',no_mismatch_seed , no_mismatch_xseed, seed)\n",
+    "                            mir_cigar = cigar_generation(rna_dict[mir],mir_seq_new)\n",
+    "                            unique_flag = check_sequence(rna_dict,mir_seq_new)\n",
     "\n",
     "                    loc = mir_location(gff_df,mir,seed)\n",
     "                    mir_depth = int(exp)\n",
     "                    line = ''\n",
     "                    line += mir + '\\t' + seq_error + '\\t' + mir_cigar + '\\t' + rna_dict[mir] + '\\t' + mir_seq_new + '\\t' + loc[0] + '\\t' + str(loc[1]) + '\\t' + str(loc[2]) + '\\t' + str(mir_depth) + '\\n'\n",
     "                    rna_ground_truth.write(line)            \n",
     "                    for dep in range(mir_depth):\n",
-    "                        fasta_seq.append('>' + mir)\n",
+    "                        if seq_error == 'None':\n",
+    "                            fasta_seq.append('>' + mir + '_no_alterations')\n",
+    "                        if seq_error == 'Seed_region':\n",
+    "                            fasta_seq.append('>' + mir + '_seed_alterations')\n",
+    "                        if seq_error == 'Seed_region':\n",
+    "                            fasta_seq.append('>' + mir + '_xseed_alterations')\n",
+    "                        if seq_error == 'Both_region':\n",
+    "                            fasta_seq.append('>' + mir + '_both_seed_&_xseed_alterations')\n",
     "                        fasta_seq.append(mir_seq_new)\n",
     "\n",
     "    return fasta_seq"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -488,7 +480,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -514,7 +506,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -569,7 +561,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -630,13 +622,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def miRSim_main_module(fa_file,total_no_seq,std_seq_percent,depth,ascii_base,out,out_file_name,ground_truth_filename,\n",
-    "                       gff_file,mir_type,out_file_type,repeat,distribution,seed,adaptor,no_mismatch_seed , no_mismatch_xseed,\n",
-    "                       parallel_thread,*argv):\n",
+    "def miRSim_main_module(fa_file,total_no_seq,std_seq_percent,depth,ascii_base,out,\n",
+    "                       out_file_name,ground_truth_filename,gff_file,mir_type,\n",
+    "                       out_file_type,repeat,distribution,seed,adaptor,no_mismatch_seed,\n",
+    "                        no_mismatch_xseed,parallel_thread,*argv):\n",
     "    start = time.time()\n",
     "    \n",
     "    if no_mismatch_seed >= 6:\n",
@@ -682,22 +675,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [],
    "source": [
     "save_path = os.getcwd()\n",
-    "mir_file = '/mnt/miRSim/refs/mature_high_conf_hsa.fa'\n",
-    "mir_gff = '/mnt/miRSim/refs/hsa_high_conf.gff3'\n",
+    "mir_file = '../refs/mature_high_conf_hsa.fa'\n",
+    "mir_gff = '../refs/hsa_high_conf.gff3'\n",
     "adaptor = 'TGGAATTCTCGGGTGCCAAGG'\n",
-    "parallel_thread = 12\n",
-    "pir_file = '/mnt/miRSim/refs/piRNAdb.hsa.v1_7_5.fa'\n",
-    "pir_gff = '/mnt/miRSim/refs/pirnadb.hg38.gff3'"
+    "parallel_thread = 10\n",
+    "pir_file = '../refs/piRNAdb.hsa.v1_7_5.fa'\n",
+    "pir_gff = '../refs/pirnadb.hg38.gff3'"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
@@ -708,7 +701,7 @@
       "****Sequence with error in seed region are generated.****\n",
       "****Sequence with error in xseed region are generated.****\n",
       "****Sequence with error in both seed and xseed region are generated.****\n",
-      "Writing 12 temporary files with chunk size of 3334\n",
+      "Writing 10 temporary files with chunk size of 4000\n",
       "------------------------------\n",
       "writing 1 chunk\n",
       "------------------------------\n",
@@ -727,19 +720,16 @@
       "writing 8 chunk\n",
       "------------------------------\n",
       "writing 9 chunk\n",
-      "------------------------------\n",
-      "writing 10 chunk\n",
-      "------------------------------\n",
-      "writing 11 chunk\n",
       "-----------------------------------------------------------------------\n",
       "All chunks are written. Now merging all the temporary files and compressing it.\n",
-      "Runtime of the program is 0.47 minutes\n"
+      "Runtime of the program is 0.52 minutes\n"
      ]
     }
    ],
    "source": [
-    "miRSim_main_module(mir_file,50000,10,10,33,save_path,'mirna_raw_data.fastq.gz','mirna_ground_truth.csv',\n",
-    "                        mir_gff,'mirna','fastq',True,'poisson','None',adaptor,2,2,parallel_thread,10,10,10)"
+    "miRSim_main_module(mir_file,50000,10,10,33,save_path,'mirna_raw_data.fastq.gz',\n",
+    "                   'mirna_ground_truth.csv',mir_gff,'mirna','fastq',True,\n",
+    "                   'poisson','None',adaptor,2,2,parallel_thread,10,10,10)"
    ]
   },
   {
@@ -767,7 +757,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.4"
+   "version": "3.9.7"
   },
   "latex_envs": {
    "LaTeX_envs_menu_present": true,

diff --git a/README.md b/README.md
@@ -193,22 +193,19 @@ Total Number of sequence = 500000
 ```
 
 In order to generate such data you need to call miRSim module separately for each RNA type using following commands `(assuming minimum depth=default, file_type=fastq, encoding_quality=33, adaptor=default)`:
+**For miRNA synthetic data**
 
 ```
-For miRNA synthetic data
-
 python miRSim.py -i refs/mature_high_conf_hsa.fa -n mirna_raw_data.fastq.gz -g mirna_ground_truth.csv -gff refs/hsa_high_conf.gff3 -t 500000 -st 20 -s 10 -x 10 -b 5 -se 1001 -th 6 -rna miRNA
 ```
+**For piRNA synthetic data**
 
 ```
-For piRNA synthetic data
-
 python miRSim.py -i refs/piRNAdb.hsa.v1_7_5.fa -n pirna_raw_data.fastq.gz -g pirna_ground_truth.csv -gff refs/pirnadb.hg38.gff3 -t 500000 -st 10 -s 10 -x 5 -b 5 -se 1001 -th 6 -rna piRNA
 ```
+**For novel miRNA synthetic data**
 
 ```
-For novel miRNA synthetic data
-
 python miRSim.py -i refs/final_novel_seq_filtered.fa -n novel_mirna_raw_data.fastq.gz -g novel_mirna_ground_truth.csv -gff refs/novel_gff.gff3 -t 500000 -st 10 -s 10 -x 3 -b 2 -se 1001 -th 6 -rna novelRNA
 ```
 After generating synthetic data for each individual RNA, merge these fastq files and their ground truth csv. e.g.

diff --git a/check_sequence.py b/check_sequence.py
@@ -0,0 +1,10 @@
+#!/usr/bin/python
+
+def check_sequence(rna_dict, seq):
+    all_sequences = [v for _,v in rna_dict.items()]
+    if seq.upper() in all_sequences:
+        unique_flag = False
+        return unique_flag
+    else:
+        unique_flag = True
+        return unique_flag