Skip to content

Commit

Permalink
feat: add ignore missing field to text chunking processor
Browse files Browse the repository at this point in the history
Signed-off-by: Ian Menendez <[email protected]>
  • Loading branch information
IanMenendez committed Sep 13, 2024
1 parent f58d989 commit ffcd3a9
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 6 deletions.
4 changes: 2 additions & 2 deletions DEVELOPER_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ One easy way to get Java 21 on *nix is to use [sdkman](https://sdkman.io/).
```bash
curl -s "https://get.sdkman.io" | bash
source ~/.sdkman/bin/sdkman-init.sh
sdk install java 11.0.2-open
sdk use java 11.0.2-open
sdk install java 21.0.2-open
sdk use java 21.0.2-open
```

JDK versions 14 and 17 were tested and are fully supported for local development.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,13 @@ public final class TextChunkingProcessor extends AbstractProcessor {
public static final String FIELD_MAP_FIELD = "field_map";
public static final String ALGORITHM_FIELD = "algorithm";
private static final String DEFAULT_ALGORITHM = FixedTokenLengthChunker.ALGORITHM_NAME;
public static final String IGNORE_MISSING = "ignore_missing";
public static final Boolean DEFAULT_IGNORE_MISSING = false;

private int maxChunkLimit;
private Chunker chunker;
private final Map<String, Object> fieldMap;
private final Boolean ignoreMissing;
private final ClusterService clusterService;
private final AnalysisRegistry analysisRegistry;
private final Environment environment;
Expand All @@ -59,12 +62,14 @@ public TextChunkingProcessor(
final String description,
final Map<String, Object> fieldMap,
final Map<String, Object> algorithmMap,
final boolean ignoreMissing,
final Environment environment,
final ClusterService clusterService,
final AnalysisRegistry analysisRegistry
) {
super(tag, description);
this.fieldMap = fieldMap;
this.ignoreMissing = ignoreMissing;
this.environment = environment;
this.clusterService = clusterService;
this.analysisRegistry = analysisRegistry;
Expand Down Expand Up @@ -250,8 +255,11 @@ private void chunkMapType(
} else {
// chunk the object when target key is of leaf type (null, string and list of string)
Object chunkObject = sourceAndMetadataMap.get(originalKey);
List<String> chunkedResult = chunkLeafType(chunkObject, runtimeParameters);
sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult);

if (!(ignoreMissing && chunkObject == null)) {
List<String> chunkedResult = chunkLeafType(chunkObject, runtimeParameters);
sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult);
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.FIELD_MAP_FIELD;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.ALGORITHM_FIELD;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.IGNORE_MISSING;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.DEFAULT_IGNORE_MISSING;
import static org.opensearch.ingest.ConfigurationUtils.readMap;
import static org.opensearch.ingest.ConfigurationUtils.readBooleanProperty;

/**
* Factory for chunking ingest processor for ingestion pipeline.
Expand Down Expand Up @@ -45,6 +48,16 @@ public TextChunkingProcessor create(
) throws Exception {
Map<String, Object> fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD);
Map<String, Object> algorithmMap = readMap(TYPE, processorTag, config, ALGORITHM_FIELD);
return new TextChunkingProcessor(processorTag, description, fieldMap, algorithmMap, environment, clusterService, analysisRegistry);
boolean ignoreMissing = readBooleanProperty(TYPE, processorTag, config, IGNORE_MISSING, DEFAULT_IGNORE_MISSING);
return new TextChunkingProcessor(
processorTag,
description,
fieldMap,
algorithmMap,
ignoreMissing,
environment,
clusterService,
analysisRegistry
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ public class TextChunkingProcessorIT extends BaseNeuralSearchIT {
private static final String TEST_DOCUMENT = "processor/chunker/TextChunkingTestDocument.json";

private static final String TEST_LONG_DOCUMENT = "processor/chunker/TextChunkingTestLongDocument.json";
private static final String TEST_DOCUMENT_NO_BODY = "processor/chunker/TextChunkingTestDocumentNoBody.json";

private static final String IGNORE_MISSING_PIPELINE_NAME = "pipeline-with-ignore-missing";

private static final Map<String, String> PIPELINE_CONFIGS_BY_NAME = Map.of(
FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME,
Expand All @@ -59,7 +62,9 @@ public class TextChunkingProcessorIT extends BaseNeuralSearchIT {
DELIMITER_PIPELINE_NAME,
"processor/chunker/PipelineForDelimiterChunker.json",
CASCADE_PIPELINE_NAME,
"processor/chunker/PipelineForCascadedChunker.json"
"processor/chunker/PipelineForCascadedChunker.json",
IGNORE_MISSING_PIPELINE_NAME,
"processor/chunker/PipelineWithIgnoreMissing.json"
);

@Before
Expand Down Expand Up @@ -176,6 +181,36 @@ public void testTextChunkingProcessor_withCascadePipeline_successful() {
}
}

@SneakyThrows
public void testTextChunkingProcessor_withIgnoreMissing() {
    try {
        // Pipeline configured with ignore_missing == true: a document lacking the
        // mapped source field must be ingested without producing chunk output fields.
        createPipelineProcessor(IGNORE_MISSING_PIPELINE_NAME);
        createTextChunkingIndex(INDEX_NAME, IGNORE_MISSING_PIPELINE_NAME);
        ingestDocument(TEST_DOCUMENT_NO_BODY);

        // Missing source field + ignore_missing -> target fields are never written.
        validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, null);
        validateIndexIngestResults(INDEX_NAME, INTERMEDIATE_FIELD, null);
    } finally {
        // Bug fix: wipe the pipeline this test actually created
        // (was CASCADE_PIPELINE_NAME, which leaked IGNORE_MISSING_PIPELINE_NAME).
        wipeOfTestResources(INDEX_NAME, IGNORE_MISSING_PIPELINE_NAME, null, null);
    }
}

@SneakyThrows
public void testTextChunkingProcessor_withoutIgnoreMissing() {
    try {
        // Bug fix: the pipeline created, the pipeline bound to the index, and the
        // pipeline wiped must all be the same. The original created
        // FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME but indexed and
        // wiped against CASCADE_PIPELINE_NAME. The cascade pipeline is the intended
        // one here, since the test validates INTERMEDIATE_FIELD, which only a
        // cascaded chunker produces.
        createPipelineProcessor(CASCADE_PIPELINE_NAME);
        createTextChunkingIndex(INDEX_NAME, CASCADE_PIPELINE_NAME);
        ingestDocument(TEST_DOCUMENT_NO_BODY);

        // Without ignore_missing, a missing source field is still chunked,
        // yielding empty passage lists in both output fields.
        List<String> expectedPassages = new ArrayList<>();
        validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages);
        validateIndexIngestResults(INDEX_NAME, INTERMEDIATE_FIELD, expectedPassages);
    } finally {
        wipeOfTestResources(INDEX_NAME, CASCADE_PIPELINE_NAME, null, null);
    }
}

private void validateIndexIngestResults(String indexName, String fieldName, Object expected) {
assertEquals(1, getDocCount(indexName));
MatchAllQueryBuilder query = new MatchAllQueryBuilder();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
  "description": "An example fixed token length chunker pipeline with ignore_missing set to true",
"processors" : [
{
"text_chunking": {
"ignore_missing": true,
"field_map": {
"body": "body_chunk"
},
"algorithm": {
"fixed_token_length": {
"token_limit": 10,
"tokenizer": "letter"
}
}
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"name": "OpenSearch"
}

0 comments on commit ffcd3a9

Please sign in to comment.