Skip to content

Commit

Permalink
validator-core - refactor data normalization to support MultiTable (add an AutoFeatureType factory), FileConverter : disable legacy FixGML (refs #231)
Browse files Browse the repository at this point in the history
  • Loading branch information
mborne committed Jul 7, 2021
1 parent d63e797 commit 443f36f
Show file tree
Hide file tree
Showing 13 changed files with 487 additions and 150 deletions.
12 changes: 10 additions & 2 deletions validator-core/src/main/java/fr/ign/validator/Context.java
Original file line number Diff line number Diff line change
Expand Up @@ -631,7 +631,11 @@ public void setOutputProjection(Projection outputProjection) {
* @return
*/
public File getDataDirectory() {
    // DATA directory for the document currently being validated,
    // e.g. {validationDirectory}/{documentName}/DATA
    File result = new File(validationDirectory, getCurrentDirectory().getName() + "/DATA");
    if (!result.exists()) {
        // create the directory lazily so callers can rely on its existence;
        // mkdirs() returning false with a still-missing directory surfaces
        // later as an I/O error at the call site
        result.mkdirs();
    }
    return result;
}

/**
Expand All @@ -640,7 +644,11 @@ public File getDataDirectory() {
* @return
*/
public File getMetadataDirectory() {
    // METADATA directory for the document currently being validated,
    // e.g. {validationDirectory}/{documentName}/METADATA
    File result = new File(validationDirectory, getCurrentDirectory().getName() + "/METADATA");
    if (!result.exists()) {
        // create the directory lazily so callers can rely on its existence
        result.mkdirs();
    }
    return result;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.List;

import org.apache.logging.log4j.LogManager;
Expand All @@ -25,8 +26,10 @@
import fr.ign.validator.metadata.gmd.MetadataISO19115;
import fr.ign.validator.model.FileModel;
import fr.ign.validator.model.file.MetadataModel;
import fr.ign.validator.model.file.MultiTableModel;
import fr.ign.validator.model.file.TableModel;
import fr.ign.validator.tools.EnvelopeUtils;
import fr.ign.validator.tools.FileUtils;
import fr.ign.validator.tools.TableReader;

/**
Expand All @@ -41,6 +44,17 @@ public class DocumentInfoExtractor {
public static final Logger log = LogManager.getRootLogger();
public static final Marker MARKER = MarkerManager.getMarker("DocumentInfoExtractor");

/**
* Stats about a given table.
*
* @author mborne
*
*/
private class TableStats {
Envelope boundingBox = new Envelope();
int totalFeatures = 0;
}

/**
* Gets informations on directory
*
Expand Down Expand Up @@ -82,46 +96,94 @@ private void parseDocumentFiles(Context context, Document document, DocumentInfo
documentFileInfo.setName(documentFile.getPath().getName());
documentFileInfo.setPath(context.relativize(documentFile.getPath()));
if (fileModel instanceof TableModel) {
parseTable(context, fileModel, documentFileInfo);
parseTable(context, (TableModel) fileModel, documentFileInfo);
} else if (fileModel instanceof MultiTableModel) {
parseTables(context, (MultiTableModel) fileModel, documentFileInfo);
}
documentInfo.addFile(documentFileInfo);
}
}

/**
* Retreive boundingBox and featureCount from normalized file
* Retrieve boundingBox and featureCount from normalized file
*
* @param context
* @param fileModel
* @param documentFileInfo
*/
private void parseTable(Context context, FileModel fileModel, DocumentFileInfo documentFileInfo) {
/**
 * Fills documentFileInfo with the feature count and bounding box computed
 * from the normalized CSV file (DATA/{fileModel.name}.csv).
 *
 * @param context
 * @param fileModel
 * @param documentFileInfo
 */
private void parseTable(Context context, TableModel fileModel, DocumentFileInfo documentFileInfo) {
    File normalizedCsv = new File(context.getDataDirectory(), fileModel.getName() + ".csv");
    TableStats tableStats = getTableStatsFromNormalizedCSV(normalizedCsv);
    if (tableStats == null) {
        // stats computation failed (already logged); leave documentFileInfo untouched
        return;
    }
    documentFileInfo.setTotalFeatures(tableStats.totalFeatures);
    documentFileInfo.setBoundingBox(tableStats.boundingBox);
}

Envelope boundingBox = new Envelope();
int totalFeatures = 0;
/**
* Retrieve boundingBox and featureCount from normalized file
*
* @param context
* @param fileModel
* @param documentFileInfo
*/
/**
 * Fills documentFileInfo with the total feature count and the merged
 * bounding box computed over every normalized CSV file found in
 * DATA/{fileModel.name}/.
 *
 * @param context
 * @param fileModel
 * @param documentFileInfo
 */
private void parseTables(Context context, MultiTableModel fileModel, DocumentFileInfo documentFileInfo) {
    File csvDirectory = new File(context.getDataDirectory(), fileModel.getName());

    String[] extensions = {
        "csv"
    };
    Collection<File> csvFiles = FileUtils.listFilesAndDirs(csvDirectory, extensions);
    if (csvFiles.isEmpty()) {
        log.warn(MARKER, "normalized CSV files for {} not found", fileModel.getName());
        return;
    }

    // stats aggregated over all tables of the multi-table file
    TableStats aggregate = new TableStats();
    for (File csvFile : csvFiles) {
        TableStats perTable = getTableStatsFromNormalizedCSV(csvFile);
        if (perTable != null) {
            // TODO save stats for each table in multi_table.
            aggregate.totalFeatures += perTable.totalFeatures;
            aggregate.boundingBox.expandToInclude(perTable.boundingBox);
        }
    }

    documentFileInfo.setTotalFeatures(aggregate.totalFeatures);
    documentFileInfo.setBoundingBox(aggregate.boundingBox);
}

/**
* Get {@link TableStats} from a normalized CSV file.
*
* @param csvFile
* @return
*/
/**
 * Get {@link TableStats} from a normalized CSV file.
 *
 * @param csvFile normalized CSV file (UTF-8)
 * @return stats for the table, or null if the file could not be read
 */
private TableStats getTableStatsFromNormalizedCSV(File csvFile) {
    TableStats result = new TableStats();
    try {
        TableReader reader = TableReader.createTableReader(csvFile, StandardCharsets.UTF_8);
        // retrieve geometry column (negative index when the table has no WKT column)
        int indexWktColumn = reader.findColumn("WKT");
        while (reader.hasNext()) {
            String[] row = reader.next();
            // count features
            result.totalFeatures++;
            // compute bounding box from the WKT geometry, if any
            if (indexWktColumn >= 0) {
                String wkt = row[indexWktColumn];
                result.boundingBox.expandToInclude(EnvelopeUtils.getEnvelope(wkt));
            }
        }
    } catch (IOException e) {
        // pass the exception as last argument so log4j2 records the stack trace
        log.error(MARKER, "fail to compute stats for {}", csvFile, e);
        return null;
    }
    return result;
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
package fr.ign.validator.normalize;

import java.io.File;
import java.io.IOException;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.Marker;
import org.apache.logging.log4j.MarkerManager;

import fr.ign.validator.Context;
import fr.ign.validator.data.Document;
import fr.ign.validator.data.DocumentFile;
import fr.ign.validator.model.FeatureType;
import fr.ign.validator.model.FileModel;
import fr.ign.validator.model.file.MetadataModel;
import fr.ign.validator.model.file.MultiTableModel;
import fr.ign.validator.model.file.PdfModel;
import fr.ign.validator.model.file.TableModel;
import fr.ign.validator.tools.AutoFeatureType;
import fr.ign.validator.tools.MultiTableReader;

/**
* Creates DATA and METADATA directories in the validation directory :
*
* <ul>
* <li>Tables are normalized according to FeatureType as
* DATA/{fileModel.name}.csv</li>
* <li>Tables are normalized according to FeatureType as
* DATA/{fileModel.name}/{tableName}.csv</li>
* <li>PDF are copied to DATA directory</li>
* <li>Metadata are copied to METADATA directory</li>
* <li>Directories are ignored</li>
* </ul>
*
* Note that DATA and METADATA corresponds to the structure of an EaaS delivery
* (former geoportal datastore).
*
* @author MBorne
*
*/
public class DocumentNormalizer {
    public static final Logger log = LogManager.getRootLogger();
    public static final Marker MARKER = MarkerManager.getMarker("NormalizePostProcess");

    /**
     * Normalize document files : dispatches each FileModel of the document to
     * the relevant normalization (single table, multi-table, PDF copy or
     * metadata copy). Other FileModel types (e.g. directories) are ignored.
     *
     * @param context
     * @param document
     * @throws IOException
     */
    public void normalize(Context context, Document document) throws IOException {
        log.info(MARKER, "Create normalized files in {} ...", context.getDataDirectory());

        /*
         * Create a normalized CSV file for each FileModel.
         */
        List<FileModel> fileModels = document.getDocumentModel().getFileModels();
        for (FileModel fileModel : fileModels) {
            // Retrieve document files corresponding to the FileModel
            List<DocumentFile> documentFiles = document.getDocumentFilesByModel(fileModel);

            if (fileModel instanceof TableModel) {
                normalizeTable(context, (TableModel) fileModel, documentFiles);
            } else if (fileModel instanceof MultiTableModel) {
                normalizeMultiTable(context, (MultiTableModel) fileModel, documentFiles);
            } else if (fileModel instanceof PdfModel) {
                createFlatCopyInTargetDirectory(fileModel, documentFiles, context.getDataDirectory());
            } else if (fileModel instanceof MetadataModel) {
                createFlatCopyInTargetDirectory(fileModel, documentFiles, context.getMetadataDirectory());
            }
        }

        log.info(MARKER, "Create normalized files in {} : completed.", context.getDataDirectory());
    }

    /**
     * Convert documentFiles in a normalized DATA/{fileModel.name}.csv file.
     *
     * Skipped (with a warning) when the FileModel provides no FeatureType, as
     * normalization requires the column definitions.
     *
     * @param context
     * @param fileModel
     * @param documentFiles
     * @throws IOException
     */
    private void normalizeTable(Context context, TableModel fileModel, List<DocumentFile> documentFiles)
        throws IOException {
        FeatureType featureType = fileModel.getFeatureType();
        if (featureType == null) {
            log.warn(MARKER, "Skip {} (no FeatureType provided)", fileModel.getName());
            return;
        }

        File csvFile = new File(context.getDataDirectory(), fileModel.getName() + ".csv");
        // fixed log message (was a copy-paste of the "no FeatureType" warning)
        log.info(MARKER, "Create {} ...", csvFile);
        TableNormalizer normalizer = new TableNormalizer(context, featureType, csvFile);
        for (DocumentFile documentFile : documentFiles) {
            log.info(MARKER, "Append {} to CSV file {}...", documentFile.getPath(), csvFile);
            normalizer.append(documentFile.getPath());
        }
        normalizer.close();
    }

    /**
     * Convert documentFiles in a normalized DATA/{fileModel.name}/{tableName}.csv
     * file.
     *
     * Normalization is only supported when exactly one file matches the
     * MultiTableModel; otherwise the model is skipped with a warning.
     *
     * @param context
     * @param fileModel
     * @param documentFiles
     * @throws IOException
     */
    private void normalizeMultiTable(Context context, MultiTableModel fileModel, List<DocumentFile> documentFiles)
        throws IOException {
        /*
         * BUGFIX : the guard previously used &&, which is always false, so an
         * empty documentFiles list fell through to documentFiles.get(0) and
         * threw IndexOutOfBoundsException.
         */
        if (documentFiles.isEmpty() || documentFiles.size() > 1) {
            log.warn(
                MARKER, "{} - skipped (found {} files, normalization not supported for MultiTable)",
                fileModel.getName(),
                documentFiles.size()
            );
            return;
        }
        DocumentFile documentFile = documentFiles.get(0);
        MultiTableReader reader = MultiTableReader.createMultiTableReader(documentFile.getPath());
        for (String tableName : reader.getTableNames()) {
            /*
             * Retrieve source path for CSV converted table.
             */
            File sourceFile = reader.getTablePath(tableName);
            /*
             * Detected FeatureType from CSV.
             *
             * TODO allow user to provide featureTypes.
             */
            FeatureType featureType = AutoFeatureType.createFeatureTypeFromTable(sourceFile);

            /*
             * Prepare output directory for the FileModel DATA/{fileModel.name}
             */
            File outputDir = new File(context.getDataDirectory(), fileModel.getName());
            if (!outputDir.exists()) {
                outputDir.mkdirs();
            }
            /*
             * Create normalized CSV file.
             */
            File outputFile = new File(outputDir, tableName + ".csv");
            TableNormalizer normalizer = new TableNormalizer(context, featureType, outputFile);
            normalizer.append(sourceFile);
            normalizer.close();
        }
    }

    /**
     * Copy files to targetDirectory without original hierarchy.
     *
     * @param fileModel
     * @param documentFiles
     * @param targetDirectory
     * @throws IOException
     */
    private void createFlatCopyInTargetDirectory(
        FileModel fileModel,
        List<DocumentFile> documentFiles,
        File targetDirectory) throws IOException {

        log.warn(MARKER, "{} - Copy {} files to {} ...", fileModel.getName(), fileModel.getType(), targetDirectory);
        for (DocumentFile documentFile : documentFiles) {
            File srcFile = documentFile.getPath();
            // flat copy : only the file name is kept, not the source hierarchy
            File destFile = new File(targetDirectory, srcFile.getName());
            log.info(MARKER, "Copy {} to {}...", srcFile, destFile);
            FileUtils.copyFile(srcFile, destFile);
        }
    }

}
Loading

0 comments on commit 443f36f

Please sign in to comment.