-
Notifications
You must be signed in to change notification settings - Fork 2
/
Drop_it.py
58 lines (43 loc) · 3.1 KB
/
Drop_it.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import pandas as pd
import pickle as pk
import numpy as np
# Reduce the five status columns of EVERYTHING.csv to plain integers and write
# the result to features.csv.
#
# Every column goes through the same sequence of text clean-up steps, so the
# steps are applied in a loop instead of being spelled out once per column.
STATUS_COLUMNS = ['bookingStatus', 'status1Day', 'status1Month', 'status1Week', 'status2Days']
UPPERCASE_LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

df = pd.read_csv("EVERYTHING.csv", usecols=STATUS_COLUMNS)
# Discard every row in which at least one of the five statuses is missing.
df.dropna(how='any', inplace=True)

for column in STATUS_COLUMNS:
    # NOTE: each step assigns its result back instead of calling
    # df[column].replace(..., inplace=True). The in-place form operates on a
    # column selected from the frame (chained assignment); with pandas
    # Copy-on-Write it updates a temporary object and leaves df untouched.

    # Drop any run of upper-case ASCII letters at the end of the value.
    cleaned = df[column].str.rstrip(to_strip=UPPERCASE_LETTERS)

    # From the first character that is not 'W', '/', 'L', a digit, whitespace
    # or a comma, replace the rest of the value with a single '0'.
    # (Original author's note: "Remove all RAC and make them 0".)
    cleaned = cleaned.replace(regex=True, to_replace=r'[^W/L\d\s,].*', value=r'0')

    # Delete ASCII letters and forward slashes, then turn every remaining
    # whitespace character into a forward slash. The two patterns are applied
    # in this order.
    cleaned = cleaned.replace(regex=True, to_replace=[r'[a-zA-Z/]', r'\s'], value=[r'', r'/'])

    # Strip leading characters drawn from the digits 1-9 and the comma.
    # '0' is not in the strip set, so stripping stops at a zero as well as at
    # any other character.
    cleaned = cleaned.str.lstrip(to_strip="123456789,")

    # Remove every remaining non-digit character.
    df[column] = cleaned.replace(regex=True, to_replace=r'\D', value=r'')

# Cells that ended up empty (or contain whitespace) become the integer 0 so
# that the integer conversion below has a value to work with.
df.replace(r'\s+|^$', 0, regex=True, inplace=True)

# The original passed raise_on_error=False for status1Day only; that keyword
# no longer exists in pandas and errors='ignore' is its replacement (on failure
# the column is returned unconverted instead of raising).
df['status1Day'] = df['status1Day'].astype(int, errors='ignore')
for column in ('status1Week', 'status1Month', 'status2Days', 'bookingStatus'):
    df[column] = df[column].astype(int)

# Write to csv
df.to_csv("features.csv", index=False)