From 22e99abc80da86b8132e92e2762bb3a86348ac31 Mon Sep 17 00:00:00 2001 From: "wangdongyu.danny" Date: Fri, 20 Sep 2024 19:20:52 +0800 Subject: [PATCH] Adding non empty check before filling in result Signed-off-by: wangdongyu.danny --- CHANGELOG.md | 1 + .../processor/InferenceProcessor.java | 8 ++- .../TextEmbeddingProcessorTests.java | 51 +++++++++++++++++++ .../resources/processor/IndexMappings.json | 3 ++ src/test/resources/processor/ingest_doc1.json | 3 ++ src/test/resources/processor/ingest_doc2.json | 3 ++ 6 files changed, 68 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5290345db..59e7eb22a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Enhancements - Set neural-search plugin 3.0.0 baseline JDK version to JDK-2 ([#838](https://github.com/opensearch-project/neural-search/pull/838)) ### Bug Fixes +- Fix for nested field missing sub embedding field in text embedding processor ### Infrastructure ### Documentation ### Maintenance diff --git a/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java index 30780a3f5..23d5df60a 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java @@ -419,8 +419,14 @@ private void putNLPResultToSourceMapForMapType( for (Map.Entry inputNestedMapEntry : ((Map) sourceValue).entrySet()) { if (sourceAndMetadataMap.get(processorKey) instanceof List) { // build nlp output for list of nested objects + List inputNestedMapValue = (List) inputNestedMapEntry.getValue(); + int valueIndex = 0; for (Map nestedElement : (List>) sourceAndMetadataMap.get(processorKey)) { - nestedElement.put(inputNestedMapEntry.getKey(), results.get(indexWrapper.index++)); + // Only fill in when value is not null + if (valueIndex < inputNestedMapValue.size() && inputNestedMapValue.get(valueIndex) != null) { + nestedElement.put(inputNestedMapEntry.getKey(), results.get(indexWrapper.index++)); + } + valueIndex++; } } else { Pair processedNestedKey = processNestedKey(inputNestedMapEntry); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java index 82b24324c..f52d69c86 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java @@ -739,6 +739,20 @@ public void testBuildVectorOutput_withNestedList_successful() { assertNotNull(nestedObj.get(1).get("vectorField")); } + public void testBuildVectorOutput_withNestedListHasNotForEmbeddingField_successful() { + Map config = createNestedListConfiguration(); + IngestDocument ingestDocument = createNestedListWithNotEmbeddingFieldIngestDocument(); + TextEmbeddingProcessor textEmbeddingProcessor = createInstanceWithNestedMapConfiguration(config); + Map knnMap = textEmbeddingProcessor.buildMapWithTargetKeys(ingestDocument); + List> modelTensorList = createMockVectorResult(); + textEmbeddingProcessor.buildNLPResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata()); + List> nestedObj = (List>) ingestDocument.getSourceAndMetadata().get("nestedField"); + assertFalse(nestedObj.get(0).containsKey("vectorField")); + assertTrue(nestedObj.get(0).containsKey("textFieldNotForEmbedding")); + assertTrue(nestedObj.get(1).containsKey("vectorField")); + assertNotNull(nestedObj.get(1).get("vectorField")); + } + public void testBuildVectorOutput_withNestedList_Level2_successful() { Map config = createNestedList2LevelConfiguration(); IngestDocument ingestDocument = create2LevelNestedListIngestDocument(); @@ -754,6 +768,21 @@ public void testBuildVectorOutput_withNestedList_Level2_successful() { assertNotNull(nestedObj.get(1).get("vectorField")); } + public void testBuildVectorOutput_withNestedListHasNotForEmbeddingField_Level2_successful() { + Map config = createNestedList2LevelConfiguration(); + IngestDocument ingestDocument = create2LevelNestedListWithNotEmbeddingFieldIngestDocument(); + TextEmbeddingProcessor textEmbeddingProcessor = createInstanceWithNestedMapConfiguration(config); + Map knnMap = textEmbeddingProcessor.buildMapWithTargetKeys(ingestDocument); + List> modelTensorList = createMockVectorResult(); + textEmbeddingProcessor.buildNLPResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata()); + Map nestedLevel1 = (Map) ingestDocument.getSourceAndMetadata().get("nestedField"); + List> nestedObj = (List>) nestedLevel1.get("nestedField"); + assertFalse(nestedObj.get(0).containsKey("vectorField")); + assertTrue(nestedObj.get(0).containsKey("textFieldNotForEmbedding")); + assertTrue(nestedObj.get(1).containsKey("vectorField")); + assertNotNull(nestedObj.get(1).get("vectorField")); + } + public void test_updateDocument_appendVectorFieldsToDocument_successful() { Map config = createPlainStringConfiguration(); IngestDocument ingestDocument = createPlainIngestDocument(); @@ -1039,6 +1068,16 @@ private IngestDocument createNestedListIngestDocument() { return new IngestDocument(nestedList, new HashMap<>()); } + private IngestDocument createNestedListWithNotEmbeddingFieldIngestDocument() { + HashMap nestedObj1 = new HashMap<>(); + nestedObj1.put("textFieldNotForEmbedding", "This is a text field"); + HashMap nestedObj2 = new HashMap<>(); + nestedObj2.put("textField", "This is another text field"); + HashMap nestedList = new HashMap<>(); + nestedList.put("nestedField", Arrays.asList(nestedObj1, nestedObj2)); + return new IngestDocument(nestedList, new HashMap<>()); + } + private IngestDocument create2LevelNestedListIngestDocument() { HashMap nestedObj1 = new HashMap<>(); nestedObj1.put("textField", "This is a text field"); @@ -1050,4 +1089,16 @@ private IngestDocument create2LevelNestedListIngestDocument() { nestedList1.put("nestedField", nestedList); return new IngestDocument(nestedList1, new HashMap<>()); } + + private IngestDocument create2LevelNestedListWithNotEmbeddingFieldIngestDocument() { + HashMap nestedObj1 = new HashMap<>(); + nestedObj1.put("textFieldNotForEmbedding", "This is a text field"); + HashMap nestedObj2 = new HashMap<>(); + nestedObj2.put("textField", "This is another text field"); + HashMap nestedList = new HashMap<>(); + nestedList.put("nestedField", Arrays.asList(nestedObj1, nestedObj2)); + HashMap nestedList1 = new HashMap<>(); + nestedList1.put("nestedField", nestedList); + return new IngestDocument(nestedList1, new HashMap<>()); + } } diff --git a/src/test/resources/processor/IndexMappings.json b/src/test/resources/processor/IndexMappings.json index 7afbaa92e..afe613117 100644 --- a/src/test/resources/processor/IndexMappings.json +++ b/src/test/resources/processor/IndexMappings.json @@ -90,6 +90,9 @@ "text": { "type": "text" }, + "text_not_for_embedding": { + "type": "text" + }, "embedding": { "type": "knn_vector", "dimension": 768, diff --git a/src/test/resources/processor/ingest_doc1.json b/src/test/resources/processor/ingest_doc1.json index b1cc5392b..d952d07d8 100644 --- a/src/test/resources/processor/ingest_doc1.json +++ b/src/test/resources/processor/ingest_doc1.json @@ -12,6 +12,9 @@ "movie": null }, "nested_passages": [ + { + "text_not_for_embedding": "test" + }, { "text": "hello" }, diff --git a/src/test/resources/processor/ingest_doc2.json b/src/test/resources/processor/ingest_doc2.json index cce93d4a1..5ab1f7525 100644 --- a/src/test/resources/processor/ingest_doc2.json +++ b/src/test/resources/processor/ingest_doc2.json @@ -10,6 +10,9 @@ "movie": null }, "nested_passages": [ + { + "text_not_for_embedding": "test" + }, { "text": "apple" },