-
Notifications
You must be signed in to change notification settings - Fork 5
/
1_xml_to_csv.py
140 lines (118 loc) · 6.35 KB
/
1_xml_to_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# Note: substantial portions of this code, expecially the actual XML to CSV conversion, are credit to Dat Tran
# see his website here: https://towardsdatascience.com/how-to-train-your-own-object-detector-with-tensorflows-object-detector-api-bec72ecfe1d9
# and his GitHub here: https://github.com/datitran/raccoon_dataset/blob/master/xml_to_csv.py
import os
import glob
import pandas as pd
import xml.etree.ElementTree as ET
# module level variables ##############################################################################################
# train and test directories
TRAINING_IMAGES_DIR = os.getcwd() + "/training_images/"
TEST_IMAGES_DIR = os.getcwd() + "/test_images/"
MIN_NUM_IMAGES_REQUIRED_FOR_TRAINING = 10
MIN_NUM_IMAGES_SUGGESTED_FOR_TRAINING = 100
MIN_NUM_IMAGES_REQUIRED_FOR_TESTING = 3
# output .csv file names/locations
TRAINING_DATA_DIR = os.getcwd() + "/" + "training_data"
TRAIN_CSV_FILE_LOC = TRAINING_DATA_DIR + "/" + "train_labels.csv"
EVAL_CSV_FILE_LOC = TRAINING_DATA_DIR + "/" + "eval_labels.csv"
#######################################################################################################################
def main():
if not checkIfNecessaryPathsAndFilesExist():
return
# end if
# if the training data directory does not exist, create it
try:
if not os.path.exists(TRAINING_DATA_DIR):
os.makedirs(TRAINING_DATA_DIR)
# end if
except Exception as e:
print("unable to create directory " + TRAINING_DATA_DIR + "error: " + str(e))
# end try
# convert training xml data to a single .csv file
print("converting xml training data . . .")
trainCsvResults = xml_to_csv(TRAINING_IMAGES_DIR)
trainCsvResults.to_csv(TRAIN_CSV_FILE_LOC, index=None)
print("training xml to .csv conversion successful, saved result to " + TRAIN_CSV_FILE_LOC)
# convert test xml data to a single .csv file
print("converting xml test data . . .")
testCsvResults = xml_to_csv(TEST_IMAGES_DIR)
testCsvResults.to_csv(EVAL_CSV_FILE_LOC, index=None)
print("test xml to .csv conversion successful, saved result to " + EVAL_CSV_FILE_LOC)
# end main
#######################################################################################################################
def checkIfNecessaryPathsAndFilesExist():
if not os.path.exists(TRAINING_IMAGES_DIR):
print('')
print('ERROR: the training images directory "' + TRAINING_IMAGES_DIR + '" does not seem to exist')
print('Did you set up the training images?')
print('')
return False
# end if
# get a list of all the .jpg / .xml file pairs in the training images directory
trainingImagesWithAMatchingXmlFile = []
for fileName in os.listdir(TRAINING_IMAGES_DIR):
if fileName.endswith(".jpg"):
xmlFileName = os.path.splitext(fileName)[0] + ".xml"
if os.path.exists(os.path.join(TRAINING_IMAGES_DIR, xmlFileName)):
trainingImagesWithAMatchingXmlFile.append(fileName)
# end if
# end if
# end for
# show an error and return false if there are no images in the training directory
if len(trainingImagesWithAMatchingXmlFile) <= 0:
print("ERROR: there don't seem to be any images and matching XML files in " + TRAINING_IMAGES_DIR)
print("Did you set up the training images?")
return False
# end if
# show an error and return false if there are not at least 10 images and 10 matching XML files in TRAINING_IMAGES_DIR
if len(trainingImagesWithAMatchingXmlFile) < MIN_NUM_IMAGES_REQUIRED_FOR_TRAINING:
print("ERROR: there are not at least " + str(MIN_NUM_IMAGES_REQUIRED_FOR_TRAINING) + " images and matching XML files in " + TRAINING_IMAGES_DIR)
print("Did you set up the training images?")
return False
# end if
# show a warning if there are not at least 100 images and 100 matching XML files in TEST_IMAGES_DIR
if len(trainingImagesWithAMatchingXmlFile) < MIN_NUM_IMAGES_SUGGESTED_FOR_TRAINING:
print("WARNING: there are not at least " + str(MIN_NUM_IMAGES_SUGGESTED_FOR_TRAINING) + " images and matching XML files in " + TRAINING_IMAGES_DIR)
print("At least " + str(MIN_NUM_IMAGES_SUGGESTED_FOR_TRAINING) + " image / xml pairs are recommended for bare minimum acceptable results")
# note we do not return false here b/c this is a warning, not an error
# end if
if not os.path.exists(TEST_IMAGES_DIR):
print('ERROR: TEST_IMAGES_DIR "' + TEST_IMAGES_DIR + '" does not seem to exist')
return False
# end if
# get a list of all the .jpg / .xml file pairs in the test images directory
testImagesWithAMatchingXmlFile = []
for fileName in os.listdir(TEST_IMAGES_DIR):
if fileName.endswith(".jpg"):
xmlFileName = os.path.splitext(fileName)[0] + ".xml"
if os.path.exists(os.path.join(TEST_IMAGES_DIR, xmlFileName)):
testImagesWithAMatchingXmlFile.append(fileName)
# end if
# end if
# end for
# show an error and return false if there are not at least 3 images and 3 matching XML files in TEST_IMAGES_DIR
if len(testImagesWithAMatchingXmlFile) <= 3:
print("ERROR: there are not at least " + str(MIN_NUM_IMAGES_REQUIRED_FOR_TESTING) + " image / xml pairs in " + TEST_IMAGES_DIR)
print("Did you separate out the test image / xml pairs from the training image / xml pairs?")
return False
# end if
return True
# end function
#######################################################################################################################
def xml_to_csv(path):
xml_list = []
for xml_file in glob.glob(path + '/*.xml'):
tree = ET.parse(xml_file)
root = tree.getroot()
for member in root.findall('object'):
value = (root.find('filename').text, int(root.find('size')[0].text), int(root.find('size')[1].text), member[0].text,
int(member[4][0].text), int(member[4][1].text), int(member[4][2].text), int(member[4][3].text))
xml_list.append(value)
column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
xml_df = pd.DataFrame(xml_list, columns=column_name)
return xml_df
# end function
#######################################################################################################################
if __name__ == "__main__":
main()