Skip to content

Commit

Permalink
feat: add ignore missing field to text chunking processor
Browse files Browse the repository at this point in the history
Signed-off-by: Ian Menendez <[email protected]>
  • Loading branch information
IanMenendez committed Sep 13, 2024
1 parent f58d989 commit ffcd3a9
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 6 deletions.
4 changes: 2 additions & 2 deletions DEVELOPER_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ One easy way to get Java 21 on *nix is to use [sdkman](https://sdkman.io/).
```bash
curl -s "https://get.sdkman.io" | bash
source ~/.sdkman/bin/sdkman-init.sh
sdk install java 11.0.2-open
sdk use java 11.0.2-open
sdk install java 21.0.2-open
sdk use java 21.0.2-open
```

JDK versions 14 and 17 were tested and are fully supported for local development.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,13 @@ public final class TextChunkingProcessor extends AbstractProcessor {
public static final String FIELD_MAP_FIELD = "field_map";
public static final String ALGORITHM_FIELD = "algorithm";
private static final String DEFAULT_ALGORITHM = FixedTokenLengthChunker.ALGORITHM_NAME;
public static final String IGNORE_MISSING = "ignore_missing";
public static final Boolean DEFAULT_IGNORE_MISSING = false;

private int maxChunkLimit;
private Chunker chunker;
private final Map<String, Object> fieldMap;
private final Boolean ignoreMissing;
private final ClusterService clusterService;
private final AnalysisRegistry analysisRegistry;
private final Environment environment;
Expand All @@ -59,12 +62,14 @@ public TextChunkingProcessor(
final String description,
final Map<String, Object> fieldMap,
final Map<String, Object> algorithmMap,
final boolean ignoreMissing,
final Environment environment,
final ClusterService clusterService,
final AnalysisRegistry analysisRegistry
) {
super(tag, description);
this.fieldMap = fieldMap;
this.ignoreMissing = ignoreMissing;
this.environment = environment;
this.clusterService = clusterService;
this.analysisRegistry = analysisRegistry;
Expand Down Expand Up @@ -250,8 +255,11 @@ private void chunkMapType(
} else {
// chunk the object when target key is of leaf type (null, string and list of string)
Object chunkObject = sourceAndMetadataMap.get(originalKey);
List<String> chunkedResult = chunkLeafType(chunkObject, runtimeParameters);
sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult);

if (!(ignoreMissing && chunkObject == null)) {
List<String> chunkedResult = chunkLeafType(chunkObject, runtimeParameters);
sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult);
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.FIELD_MAP_FIELD;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.ALGORITHM_FIELD;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.IGNORE_MISSING;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.DEFAULT_IGNORE_MISSING;
import static org.opensearch.ingest.ConfigurationUtils.readMap;
import static org.opensearch.ingest.ConfigurationUtils.readBooleanProperty;

/**
* Factory for chunking ingest processor for ingestion pipeline.
Expand Down Expand Up @@ -45,6 +48,16 @@ public TextChunkingProcessor create(
) throws Exception {
Map<String, Object> fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD);
Map<String, Object> algorithmMap = readMap(TYPE, processorTag, config, ALGORITHM_FIELD);
return new TextChunkingProcessor(processorTag, description, fieldMap, algorithmMap, environment, clusterService, analysisRegistry);
boolean ignoreMissing = readBooleanProperty(TYPE, processorTag, config, IGNORE_MISSING, DEFAULT_IGNORE_MISSING);
return new TextChunkingProcessor(
processorTag,
description,
fieldMap,
algorithmMap,
ignoreMissing,
environment,
clusterService,
analysisRegistry
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ public class TextChunkingProcessorIT extends BaseNeuralSearchIT {
private static final String TEST_DOCUMENT = "processor/chunker/TextChunkingTestDocument.json";

private static final String TEST_LONG_DOCUMENT = "processor/chunker/TextChunkingTestLongDocument.json";
private static final String TEST_DOCUMENT_NO_BODY = "processor/chunker/TextChunkingTestDocumentNoBody.json";

private static final String IGNORE_MISSING_PIPELINE_NAME = "pipeline-with-ignore-missing";

private static final Map<String, String> PIPELINE_CONFIGS_BY_NAME = Map.of(
FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME,
Expand All @@ -59,7 +62,9 @@ public class TextChunkingProcessorIT extends BaseNeuralSearchIT {
DELIMITER_PIPELINE_NAME,
"processor/chunker/PipelineForDelimiterChunker.json",
CASCADE_PIPELINE_NAME,
"processor/chunker/PipelineForCascadedChunker.json"
"processor/chunker/PipelineForCascadedChunker.json",
IGNORE_MISSING_PIPELINE_NAME,
"processor/chunker/PipelineWithIgnoreMissing.json"
);

@Before
Expand Down Expand Up @@ -176,6 +181,36 @@ public void testTextChunkingProcessor_withCascadePipeline_successful() {
}
}

@SneakyThrows
public void testTextChunkingProcessor_withIgnoreMissing() {
    try {
        // Pipeline configured with ignore_missing == true: a document lacking the
        // mapped source field must be ingested without producing chunk output fields.
        createPipelineProcessor(IGNORE_MISSING_PIPELINE_NAME);
        createTextChunkingIndex(INDEX_NAME, IGNORE_MISSING_PIPELINE_NAME);
        ingestDocument(TEST_DOCUMENT_NO_BODY);

        // Missing source field + ignore_missing -> target fields are never written.
        validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, null);
        validateIndexIngestResults(INDEX_NAME, INTERMEDIATE_FIELD, null);
    } finally {
        // Bug fix: wipe the pipeline this test actually created
        // (was CASCADE_PIPELINE_NAME, which leaked IGNORE_MISSING_PIPELINE_NAME).
        wipeOfTestResources(INDEX_NAME, IGNORE_MISSING_PIPELINE_NAME, null, null);
    }
}

@SneakyThrows
public void testTextChunkingProcessor_withoutIgnoreMissing() {
    try {
        // Bug fix: the pipeline created, the pipeline bound to the index, and the
        // pipeline wiped must all be the same. The original created
        // FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME but indexed and
        // wiped against CASCADE_PIPELINE_NAME. The cascade pipeline is the intended
        // one here, since the test validates INTERMEDIATE_FIELD, which only a
        // cascaded chunker produces.
        createPipelineProcessor(CASCADE_PIPELINE_NAME);
        createTextChunkingIndex(INDEX_NAME, CASCADE_PIPELINE_NAME);
        ingestDocument(TEST_DOCUMENT_NO_BODY);

        // Without ignore_missing, a missing source field is still chunked,
        // yielding empty passage lists in both output fields.
        List<String> expectedPassages = new ArrayList<>();
        validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages);
        validateIndexIngestResults(INDEX_NAME, INTERMEDIATE_FIELD, expectedPassages);
    } finally {
        wipeOfTestResources(INDEX_NAME, CASCADE_PIPELINE_NAME, null, null);
    }
}

private void validateIndexIngestResults(String indexName, String fieldName, Object expected) {
assertEquals(1, getDocCount(indexName));
MatchAllQueryBuilder query = new MatchAllQueryBuilder();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
  "description": "An example fixed token length chunker pipeline with ignore_missing set to true",
"processors" : [
{
"text_chunking": {
"ignore_missing": true,
"field_map": {
"body": "body_chunk"
},
"algorithm": {
"fixed_token_length": {
"token_limit": 10,
"tokenizer": "letter"
}
}
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"name": "OpenSearch"
}

0 comments on commit ffcd3a9

Please sign in to comment.