-
Notifications
You must be signed in to change notification settings - Fork 0
/
OutputConverter.py
50 lines (48 loc) · 2.03 KB
/
OutputConverter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from collections import Counter, defaultdict
from tqdm.auto import tqdm
from datetime import datetime
import json, re, ast
import pandas as pd
import numpy as np
class OutputConverter:
    """Load dictionary-like text and repair its quoting around "phenotypes".

    Typical use is ``read_text(...)`` followed by ``fix_dict()``. The loaded
    text is kept on ``self.result``; ``fix_dict`` returns the repaired string
    without modifying ``self.result``.
    """

    def __init__(self):
        """Create a converter with no text loaded yet."""
        # Set by read_text(); fix_dict() refuses to run while this is None.
        self.result = None

    @staticmethod
    def _ensure_leading_brace(text):
        """Return *text* prefixed with "{" unless it already starts with one."""
        # Leading whitespace is ignored for the check but kept in the result.
        if text.strip().startswith("{"):
            return text
        return "{" + text

    def read_text(self, text_dir=None, text=None):
        """Load the raw text, making sure it starts with an opening brace.

        Args:
            text_dir: Path of a text file. Only its first line is used
                (trailing newline included). Ignored when ``text`` is given.
            text: The raw string itself. Takes precedence over ``text_dir``.

        Returns:
            The loaded string, also stored on ``self.result``. An empty file
            or empty string yields ``"{"``.

        Raises:
            ImportError: If neither ``text_dir`` nor ``text`` is provided.
                (Exception type kept for backward compatibility.)
        """
        if text_dir is None and text is None:
            raise ImportError("No text file or text string is provided.")
        if text is None:
            with open(text_dir, 'r') as f:
                # readline() returns "" for an empty file instead of failing,
                # and avoids reading lines that are never used.
                text = f.readline()
        # Same brace rule for both sources, so a string that already carries
        # its "{" is not turned into "{{...".
        self.result = self._ensure_leading_brace(text)
        return self.result

    def fix_dict(self):
        """Return ``self.result`` with quotes normalised and phenotypes rebuilt.

        Steps:
          1. Convert single-quoted tokens and remaining unescaped single
             quotes to double quotes.
          2. Split the text at the first occurrence of "phenotypes".
          3. Rebuild the phenotypes mapping from the comma-separated
             fragments after its opening brace, dropping fragments that have
             exactly two double quotes and fewer than two colons.
          4. Close the text with "}}".

        Every "}" after the key is discarded before rebuilding, so whatever
        follows "phenotypes" is treated as one flat mapping that ends the
        outer dictionary.

        Returns:
            The repaired string.

        Raises:
            AttributeError: If no text has been loaded with ``read_text``.
            ValueError: If the text does not contain "phenotypes".
        """
        raw = getattr(self, "result", None)
        if raw is None:
            raise AttributeError(
                "No text loaded; call read_text() before fix_dict()."
            )

        # 'token' sitting between structural characters -> "token".
        fixed = re.sub(r"(?<=[:{,])\s*'([^']*)'\s*(?=[:,}])", r'"\1"', raw)
        # Drop the apostrophe of "'s" (presumably possessives) so the next
        # step does not turn it into a stray double quote.
        fixed = fixed.replace("'s", "s")
        # Any single quote not preceded by a backslash becomes a double quote.
        fixed = re.sub(r"(?<!\\)'", '"', fixed)

        # Split at the FIRST occurrence only: the word may legitimately appear
        # again inside a value, which must not break the conversion.
        before, marker, after = fixed.partition("phenotypes")
        if not marker:
            raise ValueError('The text does not contain a "phenotypes" key.')

        # Keep what follows the mapping's opening brace, without any "}".
        # (find() returning -1 keeps the whole remainder, as slicing from 0.)
        body = after[after.find('{') + 1:].replace("}", "").strip()
        kept = [
            fragment
            for fragment in body.split(",")
            if not (fragment.count(":") < 2 and fragment.count('"') == 2)
        ]
        phenotype_str = ",".join(kept)

        # Re-attach the key, supplying only the quote characters that are
        # missing around it. endswith() is safe when nothing precedes the key.
        head = before.strip()
        if before.endswith('"'):
            fixed = head + 'phenotypes": {' + phenotype_str + "}}"
        elif before.endswith("'"):
            fixed = head + "phenotypes': {" + phenotype_str + "}}"
        else:
            fixed = head + '"phenotypes": {' + phenotype_str + "}}"

        # Collapse a doubled key separator.
        return fixed.replace('":":', '":')