Preparing tag for changes with WEKA data splitter
alvarocarrera committed Feb 17, 2014
1 parent 7bee3b9 commit 39d03bc
Showing 4 changed files with 707 additions and 151 deletions.
@@ -0,0 +1,282 @@
/*******************************************************************************
* Copyright (c) 2013 alvarocarrera Grupo de Sistemas Inteligentes - Universidad Politécnica de Madrid. (GSI-UPM)
* http://www.gsi.dit.upm.es/
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the GNU Public License v2.0
* which accompanies this distribution, and is available at
*
* http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*
* Contributors:
* alvarocarrera - initial API and implementation
******************************************************************************/
/**
* es.upm.dit.gsi.barmas.dataset.utils.WekaDatasetSplitter.java
*/
package es.upm.dit.gsi.barmas.dataset.utils;

import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.logging.Logger;

import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.CSVSaver;
import weka.core.converters.ConverterUtils.DataSource;

import com.csvreader.CsvReader;
import com.csvreader.CsvWriter;

/**
* Project: barmas File:
* es.upm.dit.gsi.barmas.dataset.utils.WekaDatasetSplitter.java
*
* Grupo de Sistemas Inteligentes Departamento de Ingeniería de Sistemas
* Telemáticos Universidad Politécnica de Madrid (UPM)
*
* @author alvarocarrera
* @email [email protected]
* @twitter @alvarocarrera
* @date 31/10/2013
* @version 0.2
*
*/
public class WekaDatasetSplitter {

/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {

WekaDatasetSplitter splitter = new WekaDatasetSplitter();

String originalDatasetPath = "src/main/resources/dataset/kowlancz/CZ02/CZ02-dataset.csv";
String outputParentDir = "src/main/resources/output/kowlancz-CZ02";
Logger logger = Logger.getLogger(WekaDatasetSplitter.class.getSimpleName());

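// Each splitDataset call produces, per fold, a test dataset, a joint
// "bayes central" training dataset, and one training dataset per agent.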
// Experiment 1
String outputDir = outputParentDir;
splitter.splitDataset(3, 4, originalDatasetPath, outputDir, "CZ02", logger);

// Experiment 2
outputDir = outputParentDir;
splitter.splitDataset(10, 8, originalDatasetPath, outputDir, "CZ02", logger);

}

/**
* This method splits the original dataset into several smaller datasets,
* one for each of a given number of agents.
*
* This method uses the folds generated by WEKA and appends the essentials
* (the rows that cover every possible attribute state) at the end of each
* dataset.
*
* @param folds
* KFold number
* @param agents
* number of agents to split the original dataset among
* @param originalDatasetPath
* @param outputDir
* @param scenario
* @param logger
*/
public void splitDataset(int folds, int agents, String originalDatasetPath, String outputDir,
String scenario, Logger logger) {

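// Test ratio used in the output directory name: each of the k folds
// holds roughly 1/k of the data, e.g. folds = 3 -> 33% -> 0.33.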
int ratioint = (int) ((1 / (double) folds) * 100);
double roundedratio = ((double) ratioint) / 100;

// Look for essentials: the records that cover every attribute state
List<String[]> essentials = this.getEssentials(originalDatasetPath, logger);

for (int fold = 0; fold < folds; fold++) {
String outputDirWithRatio = outputDir + "/" + roundedratio + "testRatio/iteration-"
+ fold;
File dir = new File(outputDirWithRatio);
if (!dir.exists() || !dir.isDirectory()) {
dir.mkdirs();
}

logger.finer("--> splitDataset()");
logger.fine("Creating experiment.info...");
this.createExperimentInfoFile(folds, agents, originalDatasetPath, outputDirWithRatio,
scenario, logger);

try {

Instances originalData = this.getDataFromCSV(originalDatasetPath);

// Test dataset: WEKA's testCV extracts the instances of the current fold
Instances testData = originalData.testCV(folds, fold);
CSVSaver saver = new CSVSaver();
saver.setInstances(testData);
saver.setFile(new File(outputDirWithRatio + File.separator + "test-dataset.csv"));
saver.writeBatch();

// Bayes central dataset: trainCV returns the remaining folds, i.e. the
// joint training data of all agents
Instances trainData = originalData.trainCV(folds, fold);
saver.resetOptions();
saver.setInstances(trainData);
saver.setFile(new File(outputDirWithRatio + File.separator
+ "bayes-central-dataset.csv"));
saver.writeBatch();

// Agent datasets: one CSV per agent, all sharing the original headers
CsvReader csvreader = new CsvReader(new FileReader(new File(originalDatasetPath)));
csvreader.readHeaders();
String[] headers = csvreader.getHeaders();
csvreader.close();

HashMap<String, CsvWriter> writers = new HashMap<String, CsvWriter>();
String agentsDatasetsDir = outputDirWithRatio + File.separator + agents + "agents";
File f = new File(agentsDatasetsDir);
if (!f.isDirectory()) {
f.mkdirs();
}
for (int i = 0; i < agents; i++) {
String fileName = agentsDatasetsDir + File.separator + "agent-" + i
+ "-dataset.csv";
CsvWriter writer = new CsvWriter(new FileWriter(fileName), ',');
writer.writeRecord(headers);
writers.put("AGENT" + i, writer);
logger.fine("AGENT" + i + " dataset created.");
}

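// Deal the training instances to the agents round-robin, so each agent
// receives a near-equal, interleaved share of the training data.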
int agentCounter = 0;
for (int i = 0; i < trainData.numInstances(); i++) {
Instance instance = trainData.instance(i);
CsvWriter writer = writers.get("AGENT" + agentCounter);
String[] row = new String[instance.numAttributes()];
for (int a = 0; a < instance.numAttributes(); a++) {
row[a] = instance.stringValue(a);
}
writer.writeRecord(row);
agentCounter++;
if (agentCounter == agents) {
agentCounter = 0;
}
}

// Append essentials to all
String fileName = outputDirWithRatio + File.separator + "bayes-central-dataset.csv";
CsvWriter w = new CsvWriter(new FileWriter(fileName, true), ',');
writers.put("CENTRAL", w);
for (String[] essential : essentials) {
for (CsvWriter writer : writers.values()) {
writer.writeRecord(essential);
}
}
for (CsvWriter writer : writers.values()) {
writer.close();
}

} catch (Exception e) {
logger.severe("Exception while splitting dataset. ->");
logger.severe(e.getMessage());
System.exit(1);
}

logger.finest("Dataset for fold " + fold + " created.");
}

logger.finer("<-- splitDataset()");
}

/**
* @param folds
* @param agents
* @param originalDatasetPath
* @param outputDir
* @param scenario
* @param logger
*/
private void createExperimentInfoFile(int folds, int agents, String originalDatasetPath,
String outputDir, String scenario, Logger logger) {

try {
String fileName = outputDir + "/" + agents + "agents/experiment.info";
File file = new File(fileName);
File parent = file.getParentFile();
if (!parent.exists()) {
parent.mkdirs();
}
FileWriter fw = new FileWriter(file);
fw.write("Scenario: " + scenario + "\n");
fw.write("Number of folds: " + Integer.toString(folds) + "\n");
fw.write("Number of Agents: " + Integer.toString(agents) + "\n");
fw.write("Original dataset: " + originalDatasetPath + "\n");
fw.write("Experiment dataset folder: " + outputDir + "\n");
fw.close();

} catch (Exception e) {
logger.severe(e.getMessage());
System.exit(1);
}
}

/**
* @param originalDatasetPath
* @param logger
* @return the list of essential records found in the original dataset
*/
private List<String[]> getEssentials(String originalDatasetPath, Logger logger) {
// Find essentials
List<String[]> essentials = new ArrayList<String[]>();
HashMap<String, List<String>> nodesAndStates = new HashMap<String, List<String>>();
try {
// Look for all possible states
Reader fr = new FileReader(originalDatasetPath);
CsvReader reader = new CsvReader(fr);
reader.readHeaders();
String[] headers = reader.getHeaders();
for (String header : headers) {
nodesAndStates.put(header, new ArrayList<String>());
}
String[] values;
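// A record is "essential" if it is the first record in which some
// attribute value appears; together the essentials cover every state
// of every attribute in the dataset.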
while (reader.readRecord()) {
values = reader.getValues();
for (int i = 0; i < values.length; i++) {
if (!nodesAndStates.get(headers[i]).contains(values[i])) {
nodesAndStates.get(headers[i]).add(values[i]);
if (!essentials.contains(values)) {
essentials.add(values);
}
}
}
}

reader.close();

logger.fine("Number of Essentials: " + essentials.size());
} catch (Exception e) {
logger.severe(e.getMessage());
System.exit(1);
}
return essentials;
}

/**
* @param csvFilePath
* @return the CSV data as WEKA Instances, with the last attribute set as
* the class attribute
* @throws Exception
*/
private Instances getDataFromCSV(String csvFilePath) throws Exception {
DataSource source = new DataSource(csvFilePath);
Instances data = source.getDataSet();
data.setClassIndex(data.numAttributes() - 1);
return data;
}
}