Preparing tag for changes with WEKA data splitter
alvarocarrera committed Feb 17, 2014
1 parent 7bee3b9 commit 39d03bc
Showing 4 changed files with 707 additions and 151 deletions.
@@ -0,0 +1,282 @@
/*******************************************************************************
* Copyright (c) 2013 alvarocarrera Grupo de Sistemas Inteligentes - Universidad Politécnica de Madrid. (GSI-UPM)
* http://www.gsi.dit.upm.es/
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the GNU Public License v2.0
* which accompanies this distribution, and is available at
*
* http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*
* Contributors:
* alvarocarrera - initial API and implementation
******************************************************************************/
/**
* es.upm.dit.gsi.barmas.dataset.utils.WekaDatasetSplitter.java
*/
package es.upm.dit.gsi.barmas.dataset.utils;

import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.logging.Logger;

import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.CSVSaver;
import weka.core.converters.ConverterUtils.DataSource;

import com.csvreader.CsvReader;
import com.csvreader.CsvWriter;

/**
* Project: barmas File:
* es.upm.dit.gsi.barmas.dataset.utils.WekaDatasetSplitter.java
*
* Grupo de Sistemas Inteligentes Departamento de Ingeniería de Sistemas
* Telemáticos Universidad Politécnica de Madrid (UPM)
*
* @author alvarocarrera
* @email [email protected]
* @twitter @alvarocarrera
* @date 31/10/2013
* @version 0.2
*
*/
public class WekaDatasetSplitter {

/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {

WekaDatasetSplitter splitter = new WekaDatasetSplitter();

String originalDatasetPath = "src/main/resources/dataset/kowlancz/CZ02/CZ02-dataset.csv";
String outputParentDir = "src/main/resources/output/kowlancz-CZ02";
Logger logger = Logger.getLogger(WekaDatasetSplitter.class.getSimpleName());

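// Each splitDataset call produces, per fold, a test dataset, a joint
// "bayes central" training dataset, and one training dataset per agent.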
// Experiment 1
String outputDir = outputParentDir;
splitter.splitDataset(3, 4, originalDatasetPath, outputDir, "CZ02", logger);

// Experiment 2
outputDir = outputParentDir;
splitter.splitDataset(10, 8, originalDatasetPath, outputDir, "CZ02", logger);

}

/**
* This method splits the original dataset into several smaller datasets,
* one for each of a given number of agents.
*
* This method uses the folds generated by WEKA and appends the essentials
* (the rows that cover every possible attribute state) at the end of each
* dataset.
*
* @param folds
* KFold number
* @param agents
* number of agents to split the original dataset among
* @param originalDatasetPath
* @param outputDir
* @param scenario
* @param logger
*/
public void splitDataset(int folds, int agents, String originalDatasetPath, String outputDir,
String scenario, Logger logger) {

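// Test ratio used in the output directory name: each of the k folds
// holds roughly 1/k of the data, e.g. folds = 3 -> 33% -> 0.33.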
int ratioint = (int) ((1 / (double) folds) * 100);
double roundedratio = ((double) ratioint) / 100;

// Look for essentials: the records that cover every attribute state
List<String[]> essentials = this.getEssentials(originalDatasetPath, logger);

for (int fold = 0; fold < folds; fold++) {
String outputDirWithRatio = outputDir + "/" + roundedratio + "testRatio/iteration-"
+ fold;
File dir = new File(outputDirWithRatio);
if (!dir.exists() || !dir.isDirectory()) {
dir.mkdirs();
}

logger.finer("--> splitDataset()");
logger.fine("Creating experiment.info...");
this.createExperimentInfoFile(folds, agents, originalDatasetPath, outputDirWithRatio,
scenario, logger);

try {

Instances originalData = this.getDataFromCSV(originalDatasetPath);

// Test dataset: WEKA's testCV extracts the instances of the current fold
Instances testData = originalData.testCV(folds, fold);
CSVSaver saver = new CSVSaver();
saver.setInstances(testData);
saver.setFile(new File(outputDirWithRatio + File.separator + "test-dataset.csv"));
saver.writeBatch();

// Bayes central dataset: trainCV returns the remaining folds, i.e. the
// joint training data of all agents
Instances trainData = originalData.trainCV(folds, fold);
saver.resetOptions();
saver.setInstances(trainData);
saver.setFile(new File(outputDirWithRatio + File.separator
+ "bayes-central-dataset.csv"));
saver.writeBatch();

// Agent datasets: one CSV per agent, all sharing the original headers
CsvReader csvreader = new CsvReader(new FileReader(new File(originalDatasetPath)));
csvreader.readHeaders();
String[] headers = csvreader.getHeaders();
csvreader.close();

HashMap<String, CsvWriter> writers = new HashMap<String, CsvWriter>();
String agentsDatasetsDir = outputDirWithRatio + File.separator + agents + "agents";
File f = new File(agentsDatasetsDir);
if (!f.isDirectory()) {
f.mkdirs();
}
for (int i = 0; i < agents; i++) {
String fileName = agentsDatasetsDir + File.separator + "agent-" + i
+ "-dataset.csv";
CsvWriter writer = new CsvWriter(new FileWriter(fileName), ',');
writer.writeRecord(headers);
writers.put("AGENT" + i, writer);
logger.fine("AGENT" + i + " dataset created.");
}

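// Deal the training instances to the agents round-robin, so each agent
// receives a near-equal, interleaved share of the training data.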
int agentCounter = 0;
for (int i = 0; i < trainData.numInstances(); i++) {
Instance instance = trainData.instance(i);
CsvWriter writer = writers.get("AGENT" + agentCounter);
String[] row = new String[instance.numAttributes()];
for (int a = 0; a < instance.numAttributes(); a++) {
row[a] = instance.stringValue(a);
}
writer.writeRecord(row);
agentCounter++;
if (agentCounter == agents) {
agentCounter = 0;
}
}

// Append essentials to all
String fileName = outputDirWithRatio + File.separator + "bayes-central-dataset.csv";
CsvWriter w = new CsvWriter(new FileWriter(fileName, true), ',');
writers.put("CENTRAL", w);
for (String[] essential : essentials) {
for (CsvWriter writer : writers.values()) {
writer.writeRecord(essential);
}
}
for (CsvWriter writer : writers.values()) {
writer.close();
}

} catch (Exception e) {
logger.severe("Exception while splitting dataset. ->");
logger.severe(e.getMessage());
System.exit(1);
}

logger.finest("Dataset for fold " + fold + " created.");
}

logger.finer("<-- splitDataset()");
}

/**
* @param folds
* @param agents
* @param originalDatasetPath
* @param outputDir
* @param scenario
* @param logger
*/
private void createExperimentInfoFile(int folds, int agents, String originalDatasetPath,
String outputDir, String scenario, Logger logger) {

try {
String fileName = outputDir + "/" + agents + "agents/experiment.info";
File file = new File(fileName);
File parent = file.getParentFile();
if (!parent.exists()) {
parent.mkdirs();
}
FileWriter fw = new FileWriter(file);
fw.write("Scenario: " + scenario + "\n");
fw.write("Number of folds: " + Integer.toString(folds) + "\n");
fw.write("Number of Agents: " + Integer.toString(agents) + "\n");
fw.write("Original dataset: " + originalDatasetPath + "\n");
fw.write("Experiment dataset folder: " + outputDir + "\n");
fw.close();

} catch (Exception e) {
logger.severe(e.getMessage());
System.exit(1);
}
}

/**
* @param originalDatasetPath
* @param logger
* @return the list of essential records found in the original dataset
*/
private List<String[]> getEssentials(String originalDatasetPath, Logger logger) {
// Find essentials
List<String[]> essentials = new ArrayList<String[]>();
HashMap<String, List<String>> nodesAndStates = new HashMap<String, List<String>>();
try {
// Look for all possible states
Reader fr = new FileReader(originalDatasetPath);
CsvReader reader = new CsvReader(fr);
reader.readHeaders();
String[] headers = reader.getHeaders();
for (String header : headers) {
nodesAndStates.put(header, new ArrayList<String>());
}
String[] values;
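// A record is "essential" if it is the first record in which some
// attribute value appears; together the essentials cover every state
// of every attribute in the dataset.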
while (reader.readRecord()) {
values = reader.getValues();
for (int i = 0; i < values.length; i++) {
if (!nodesAndStates.get(headers[i]).contains(values[i])) {
nodesAndStates.get(headers[i]).add(values[i]);
if (!essentials.contains(values)) {
essentials.add(values);
}
}
}
}

reader.close();

logger.fine("Number of Essentials: " + essentials.size());
} catch (Exception e) {
logger.severe(e.getMessage());
System.exit(1);
}
return essentials;
}

/**
* @param csvFilePath
* @return the CSV data as WEKA Instances, with the last attribute set as
* the class attribute
* @throws Exception
*/
private Instances getDataFromCSV(String csvFilePath) throws Exception {
DataSource source = new DataSource(csvFilePath);
Instances data = source.getDataSet();
data.setClassIndex(data.numAttributes() - 1);
return data;
}
}