-
Notifications
You must be signed in to change notification settings - Fork 0
/
raw2plaintext.py
41 lines (31 loc) · 1.17 KB
/
raw2plaintext.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
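#
# Strips punctuation, digits and every other non-Cyrillic character
# from each file in the raw-data directory, lowercases the remaining
# words and saves the result into a single text file. Word frequencies
# over all files are written to a CSV file next to it.
#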
import re
import nltk
import csv
import os
RAW_DATA_PATH = "data_raw"
PROCESSED_DATA_PATH = "data_processed"
OFNAME = os.path.join(PROCESSED_DATA_PATH, "data_processed.txt")
CSV_NAME = os.path.splitext(OFNAME)[0] + ".csv"

# NOTE(review): the raw files are assumed to be UTF-8; the original relied on
# the platform default encoding. Change this if the corpus is stored in
# another encoding (e.g. cp1251).
ENCODING = "utf-8"

# A word is a maximal run of characters in U+0400..U+0500 (inclusive, the same
# range the original filter used); every other character acts as a separator.
CYRILLIC_RUN = re.compile("[\u0400-\u0500]+")


def tokenize(text):
    """Return the lowercased Cyrillic words found in ``text``, in order.

    Digits, punctuation, whitespace, Latin letters and any other character
    outside U+0400..U+0500 are dropped and only serve as word boundaries.

    Args:
        text: The raw text of one input file.

    Returns:
        A list of lowercased words; empty if ``text`` contains no
        characters from the accepted range.
    """
    return [word.lower() for word in CYRILLIC_RUN.findall(text)]


def main():
    """Clean every file in RAW_DATA_PATH and write the combined results.

    Writes two files:
      * OFNAME   - the space-separated tokens of each input file, one
                   input file per line.
      * CSV_NAME - one ``word,count`` record per distinct token, counted
                   over all input files.
    """
    # The output directory is not guaranteed to exist on a fresh checkout.
    os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

    freqs = nltk.FreqDist()
    with open(OFNAME, "w", encoding=ENCODING) as output_file:
        # os.listdir() order is arbitrary; sort so reruns give identical output.
        for filename in sorted(os.listdir(RAW_DATA_PATH)):
            path = os.path.join(RAW_DATA_PATH, filename)
            if not os.path.isfile(path):
                # Sub-directories cannot be opened as text files; skip them.
                continue
            with open(path, "r", encoding=ENCODING) as f:
                tokens = tokenize(f.read())
            # In-place update avoids rebuilding the whole distribution per file.
            freqs.update(tokens)
            # Each token keeps its trailing space, as in the original output.
            output_file.write("".join("%s " % token for token in tokens))
            # NOTE(review): assumed to run once per input file (one document
            # per line); the original indentation was not recoverable.
            output_file.write("\n")

    # newline="" is required by the csv module so it controls line endings.
    with open(CSV_NAME, "w", encoding=ENCODING, newline="") as f:
        # writerows() emits one (word, count) record per line; writerow() would
        # put every stringified tuple into a single row.
        csv.writer(f).writerows(freqs.items())


if __name__ == "__main__":
    main()