-
Notifications
You must be signed in to change notification settings - Fork 0
/
analysis.py
79 lines (60 loc) · 2.06 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
import re
from itertools import islice

import pandas as pd
import spacy
def get_phone_numbers(string):
    """Return every phone-number-like run found in *string*, digits only.

    Three shapes are recognised: ten digits with optional ``-``, ``.`` or
    whitespace separators, ``(ddd)`` followed by seven digits, and seven
    digits with one optional separator. Matches are returned in the order
    they occur, with every non-digit character removed.
    """
    phone_pattern = re.compile(
        r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')
    drop_non_digits = re.compile(r'\D').sub

    digits_only = []
    for found in phone_pattern.finditer(string):
        # The pattern is one capturing group, so group(1) is the whole match.
        digits_only.append(drop_non_digits('', found.group(1)))
    return digits_only
# Local part: word characters, dots and hyphens. Domain: the same alphabet,
# but it must END on a word character so that sentence punctuation directly
# after an address ("... bob@example.com.") is not captured as part of it.
_EMAIL_PATTERN = re.compile(r'[\w.-]+@[\w.-]*\w')


def get_email_addresses(string):
    """Return all e-mail-like substrings of *string*, in order of appearance.

    This is a loose syntactic match (``something@something``), not a
    validation of the address. A trailing ``.`` or ``-`` after the domain is
    left out of the match.

    Args:
        string: Text to scan.

    Returns:
        A list of matched address strings; empty when nothing matches.
    """
    return _EMAIL_PATTERN.findall(string)
class Data:
    """A (label, value) pair describing one item extracted from the text."""

    def __init__(self, label, value):
        # Kept as plain instance attributes, in this order, so the pair is
        # also available as a {"label": ..., "value": ...} mapping via __dict__.
        self.label, self.value = label, value
# ---------------------------------------------------------------------------
# Script body: read the first MAX_LINES lines of an Outlook CSV export, run
# them through spaCy, collect e-mail addresses (regex on each token) and the
# named entities, write the EMAIL records to a CSV and print those whose
# value contains "dmirs".
# ---------------------------------------------------------------------------

# Upper bound on how many lines of the export are analysed.
MAX_LINES = 100000

file_name = 'C:/dev/outlook/outlook-insight-2022-nov.CSV'
# Check the input before the (slow) model load. Raising a plain string is a
# TypeError in Python 3, so raise a real exception type.
if not os.path.isfile(file_name):
    raise FileNotFoundError(f"File not found: {file_name}")

print(f'File Size is {os.stat(file_name).st_size / (1024 * 1024)} MB')

# islice stops quietly at end-of-file, so exports shorter than MAX_LINES no
# longer blow up with StopIteration. Lines are joined with a newline so the
# last word of one line is not fused with the first word of the next, and the
# text is built in one pass instead of by repeated string concatenation.
with open(file_name, encoding="utf8") as file:
    text = "\n".join(line.strip() for line in islice(file, MAX_LINES))

# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")
# Keep the previously configured limit as a floor, but never let the text
# (now slightly longer because of the separators) exceed max_length.
nlp.max_length = max(3531630, len(text))

doc = nlp(text)

records = []
seen_emails = set()
for token in doc:
    for email in get_email_addresses(token.text):
        # De-duplicate on the address string itself; comparing the string
        # against the list of Data objects would never find a match.
        if email not in seen_emails:
            seen_emails.add(email)
            records.append(Data("EMAIL", email))

# Named entities found by the model are collected under their own labels.
records.extend(Data(entity.label_, entity.text) for entity in doc.ents)
print(len(records))

# Only records labelled 'EMAIL' are exported.
# NOTE(review): whether the model itself ever emits an 'EMAIL' entity label
# is not visible from this file - confirm if entity rows are expected here.
filtered = [e for e in records if e.label == 'EMAIL']
# Explicit columns keep 'label'/'value' present even when nothing was found,
# so the lookup on emails['value'] below cannot fail with KeyError.
df = pd.DataFrame([t.__dict__ for t in filtered], columns=["label", "value"])

output_file = 'C:/dev/outlook/output-nov.CSV'
if "dec" in file_name:
    output_file = 'C:/dev/outlook/output-dec.CSV'
df.to_csv(output_file)

# Read the export back and show the addresses that mention "dmirs".
emails = pd.read_csv(output_file)
print(emails.dtypes)
var = emails.loc[emails['value'].str.contains("dmirs", case=False, na=False)]
print(var)