-
Notifications
You must be signed in to change notification settings - Fork 0
/
preProcess.py
115 lines (87 loc) · 3.63 KB
/
preProcess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python3
#coding:utf-8
'''
data
'''
import os
import sys
import re
import pandas as pd
#os.environ["model= 'xxlarge' --do_train=1 --do_predict=0 --file_pre='a'"] = "0,1,2"
# Directory that holds the input CSV files.
dat_path = '../ALbert/data/'

# Training set; per the field list used by ProcessAllTrain its columns are
# id, subtask_a, sbutask_b, subtask_c, tweet.
train_file = os.path.join(dat_path, 'training.csv')

# Test-set file names (resolved relative to dat_path), one per level a/b/c.
tests = [
    'testset-levela.tsv.csv',
    'testset-levelb.tsv.csv',
    'testset-levelc.tsv.csv',
]
def process_test_dat(filename):
    """Convert one test-set CSV into a headerless, tab-separated file.

    The input is read with ``pandas.read_csv`` and must contain a ``tweet``
    column.  Every literal ``@user`` is removed from the tweets, a constant
    placeholder ``label`` of 0 is added, and only the ``tweet`` and ``label``
    columns are written (tab-separated, no header, no index).

    The output path is ``filename`` with its last extension stripped, e.g.
    ``testset-levela.tsv.csv`` -> ``testset-levela.tsv``.

    Args:
        filename: Path of the CSV file to convert.

    Returns:
        The path of the file that was written.
    """
    df = pd.read_csv(filename)
    df['label'] = 0
    # '@user' is a literal token, not a pattern.  Passing regex=False makes
    # the call independent of the pandas default, which differs by version.
    df['tweet'] = df['tweet'].str.replace('@user', '', regex=False)
    df = df[['tweet', 'label']]
    nfilename = os.path.splitext(filename)[0]
    df.to_csv(nfilename, header=False, sep='\t', index=False)
    print('save to file: %s' % nfilename)
    return nfilename
def ProcessAllTest():
    """Run ``process_test_dat`` on every file listed in ``tests``.

    Each name in ``tests`` is joined onto ``dat_path``; the resulting path is
    announced on stdout and then converted.
    """
    paths = [os.path.join(dat_path, name) for name in tests]
    for path in paths:
        print('preprocess testset file: %s' % path)
        process_test_dat(path)
def splitdataset(df, file_pre='dat', random_state=None):
    """Shuffle ``df`` and write it out as train/test/val files (8:1:1).

    The shuffled rows are cut at 20% (rounded with ``round``): the first part
    is the hold-out set, the remainder is the training set.  The hold-out set
    is then cut in half into test and validation sets.  Each part is written
    tab-separated, without header or index, to ``<file_pre>_train.tsv``,
    ``<file_pre>_test.tsv`` and ``<file_pre>_val.tsv``.

    Args:
        df: DataFrame to split.
        file_pre: Path prefix of the three output files.
        random_state: Forwarded to ``DataFrame.sample``.  The default ``None``
            gives a different shuffle on every call; pass a seed to make the
            split reproducible.

    Returns:
        Tuple ``(df_train, df_test, df_val)`` of the written partitions.
    """
    print('processing... file: ', file_pre)
    shuffled = df.sample(frac=1.0, random_state=random_state)

    # First 20% of the shuffled rows are held out, the rest is training data.
    holdout_size = int(round(0.2 * shuffled.shape[0]))
    df_holdout = shuffled.iloc[:holdout_size]
    df_train = shuffled.iloc[holdout_size:]
    df_train.to_csv(file_pre + '_train.tsv', header=False, sep='\t', index=False)

    # Split the held-out rows evenly into test and validation sets.
    half = int(round(0.5 * df_holdout.shape[0]))
    df_test = df_holdout.iloc[:half]
    df_val = df_holdout.iloc[half:]
    df_val.to_csv(file_pre + '_val.tsv', header=False, sep='\t', index=False)
    df_test.to_csv(file_pre + '_test.tsv', header=False, sep='\t', index=False)

    print('Records train:test:val = %d:%d:%d ' %
          (df_train.shape[0], df_test.shape[0], df_val.shape[0]))
    print('-' * 40)
    return df_train, df_test, df_val
def MapNewColumn(df, oldcol, newcol, isdrop=1, workpath='./'):
    """Add an integer-coded version of a column to ``df``.

    The code for each distinct value of ``oldcol`` is taken from
    ``df[oldcol].value_counts().argsort()``, so it depends on how often the
    values occur in ``df``.  The value -> code table is also saved as
    ``MapNewColumn_<oldcol>.csv`` inside ``workpath``.

    Args:
        df: DataFrame to extend; it is modified in place.
        oldcol: Name of the column to encode.
        newcol: Name of the column that receives the integer codes.
        isdrop: If truthy, ``oldcol`` is removed from ``df`` afterwards.
        workpath: Directory in which the mapping table is written.

    Returns:
        The same ``df`` object that was passed in.
    """
    codes = df[oldcol].value_counts().argsort()
    print('[%s]Column value distribution:' % oldcol)

    # Persist the mapping so the codes can be translated back later.
    table = pd.DataFrame({'index': codes.index, 'values': codes.values})
    table_path = os.path.join(workpath, 'MapNewColumn_%s.csv' % oldcol)
    table.to_csv(table_path)
    print('[%s]Column value distribution saved' % oldcol)

    df[newcol] = df[oldcol].map(codes)
    if not isdrop:
        return df
    df.drop(columns=oldcol, inplace=True)
    return df
def ProcessAllTrain():
    """Build the per-subtask train/test/val files from ``train_file``.

    The training CSV is read once and every literal ``@user`` is removed from
    the ``tweet`` column.  Then, for each of the three subtasks, the tweet and
    that subtask's label column are copied out, rows without a label are
    dropped (subtasks b and c only), the label is replaced by an integer id
    via ``MapNewColumn`` and the result is written with ``splitdataset``.

    Output files use the training file's path without extension plus
    ``_a`` / ``_b`` / ``_c`` as prefix; the label mapping tables are written
    to ``dat_path``.
    """
    nfilename = os.path.splitext(train_file)[0]
    print('process training file'.center(40, '-'))
    df_train = pd.read_csv(train_file)
    # '@user' is a literal token; regex=False avoids relying on the pandas
    # default, which differs between versions.
    df_train['tweet'] = df_train['tweet'].str.replace('@user', '', regex=False)

    # (label column, id column to create, output suffix, drop unlabelled rows)
    # NOTE: 'sbutask_b' follows the field list of the training CSV
    # (id,subtask_a,sbutask_b,subtask_c,tweet) - do not "correct" it here.
    subtasks = (
        ('subtask_a', 'subtask_a_id', '_a', False),
        ('sbutask_b', 'subtask_b_id', '_b', True),
        ('subtask_c', 'subtask_c_id', '_c', True),
    )
    for label_col, id_col, suffix, drop_missing in subtasks:
        df_task = df_train[['tweet', label_col]].copy()
        if drop_missing:
            df_task.dropna(subset=[label_col], inplace=True)
        df_task = MapNewColumn(df_task, label_col, id_col, isdrop=1, workpath=dat_path)
        splitdataset(df_task, file_pre=nfilename + suffix)

    print('All training file saved.')
if __name__ == '__main__':
    # Script entry point: convert the test sets first, then the training set.
    ProcessAllTest()
    ProcessAllTrain()