From c3e458696cfe41088741b0b5c5aeb58de8ec32be Mon Sep 17 00:00:00 2001 From: "opensearch-trigger-bot[bot]" <98922864+opensearch-trigger-bot[bot]@users.noreply.github.com> Date: Mon, 22 Apr 2024 09:00:44 -0700 Subject: [PATCH] Onboard neural sparse search (#141) (#142) Signed-off-by: Tyler Ohlsen (cherry picked from commit 4d5f50c21e47264b1263daad20b09c592f87961a) Co-authored-by: Tyler Ohlsen --- common/constants.ts | 35 +++- common/interfaces.ts | 26 ++- public/component_types/index.ts | 1 + public/component_types/indexer/indexer.ts | 6 +- public/component_types/indexer/knn_indexer.ts | 2 +- public/component_types/other/document.tsx | 30 +++ public/component_types/other/index.ts | 6 + public/component_types/transformer/index.ts | 1 + .../transformer/sparse_encoder_transformer.ts | 64 +++++++ .../transformer/text_embedding_transformer.ts | 23 ++- .../component_details/component_inputs.tsx | 9 +- .../input_fields/model_field.tsx | 23 +++ .../workflow_detail/prototype/ingestor.tsx | 32 ++-- .../prototype/query_executor.tsx | 53 ++++-- .../pages/workflow_detail/prototype/utils.ts | 15 +- .../utils/data_extractor_utils.ts | 4 +- .../utils/workflow_to_template_utils.ts | 173 +++++++++++++----- .../workspace/resizable_workspace.tsx | 2 +- public/pages/workflows/new_workflow/utils.ts | 169 +++++++++++++++-- .../pages/workflows/workflow_list/columns.tsx | 2 + public/utils/constants.ts | 3 + .../templates/neural_sparse_search.json | 12 ++ .../resources/templates/semantic_search.json | 4 +- 23 files changed, 576 insertions(+), 119 deletions(-) create mode 100644 public/component_types/other/document.tsx create mode 100644 public/component_types/other/index.ts create mode 100644 public/component_types/transformer/sparse_encoder_transformer.ts create mode 100644 server/resources/templates/neural_sparse_search.json diff --git a/common/constants.ts b/common/constants.ts index 7f8a507e..4a9ab001 100644 --- a/common/constants.ts +++ b/common/constants.ts @@ -7,6 +7,7 @@ import { 
MODEL_ALGORITHM, PRETRAINED_MODEL_FORMAT, PretrainedSentenceTransformer, + PretrainedSparseEncodingModel, WORKFLOW_STATE, } from './interfaces'; @@ -61,11 +62,15 @@ export const CREATE_INGEST_PIPELINE_STEP_TYPE = 'create_ingest_pipeline'; export const CREATE_INDEX_STEP_TYPE = 'create_index'; export const REGISTER_LOCAL_PRETRAINED_MODEL_STEP_TYPE = 'register_local_pretrained_model'; +export const REGISTER_LOCAL_SPARSE_ENCODING_MODEL_STEP_TYPE = + 'register_local_sparse_encoding_model'; /** * ML PLUGIN PRETRAINED MODELS - * (based off of https://opensearch.org/docs/latest/ml-commons-plugin/pretrained-models/#sentence-transformers) + * (based off of https://opensearch.org/docs/latest/ml-commons-plugin/pretrained-models) */ + +// ---- SENTENCE TRANSFORMERS ---- export const ROBERTA_SENTENCE_TRANSFORMER = { name: 'huggingface/sentence-transformers/all-distilroberta-v1', shortenedName: 'all-distilroberta-v1', @@ -96,6 +101,34 @@ export const BERT_SENTENCE_TRANSFORMER = { vectorDimensions: 768, } as PretrainedSentenceTransformer; +// ---- SPARSE ENCODERS ---- +export const NEURAL_SPARSE_TRANSFORMER = { + name: 'amazon/neural-sparse/opensearch-neural-sparse-encoding-v1', + shortenedName: 'opensearch-neural-sparse-encoding-v1', + description: 'A general neural sparse encoding model', + format: PRETRAINED_MODEL_FORMAT.TORCH_SCRIPT, + algorithm: MODEL_ALGORITHM.SPARSE_ENCODING, + version: '1.0.1', +} as PretrainedSparseEncodingModel; + +export const NEURAL_SPARSE_DOC_TRANSFORMER = { + name: 'amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1', + shortenedName: 'opensearch-neural-sparse-encoding-doc-v1', + description: 'A general neural sparse encoding model', + format: PRETRAINED_MODEL_FORMAT.TORCH_SCRIPT, + algorithm: MODEL_ALGORITHM.SPARSE_ENCODING, + version: '1.0.1', +} as PretrainedSparseEncodingModel; + +export const NEURAL_SPARSE_TOKENIZER_TRANSFORMER = { + name: 'amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1', + shortenedName: 
'opensearch-neural-sparse-tokenizer-v1', + description: 'A neural sparse tokenizer model', + format: PRETRAINED_MODEL_FORMAT.TORCH_SCRIPT, + algorithm: MODEL_ALGORITHM.SPARSE_ENCODING, + version: '1.0.1', +} as PretrainedSparseEncodingModel; + /** * MISCELLANEOUS */ diff --git a/common/interfaces.ts b/common/interfaces.ts index 858965cf..a2686d6e 100644 --- a/common/interfaces.ts +++ b/common/interfaces.ts @@ -5,6 +5,7 @@ import { Node, Edge } from 'reactflow'; import { IComponentData } from '../public/component_types'; +import { COMPONENT_CLASS } from '../public/utils'; export type Index = { name: string; @@ -16,7 +17,11 @@ export type Index = { */ export type ReactFlowComponent = Node; -export type ReactFlowEdge = Edge<{}> & {}; +export type ReactFlowEdge = Edge<{}> & { + key: string; + sourceClasses: COMPONENT_CLASS[]; + targetClasses: COMPONENT_CLASS[]; +}; type ReactFlowViewport = { x: number; @@ -49,6 +54,22 @@ export type TextEmbeddingProcessor = IngestProcessor & { }; }; +export type SparseEncodingProcessor = IngestProcessor & { + sparse_encoding: { + model_id: string; + field_map: {}; + }; +}; + +export type IndexConfiguration = { + settings: {}; + mappings: IndexMappings; +}; + +export type IndexMappings = { + properties: {}; +}; + export type TemplateNode = { id: string; type: string; @@ -135,6 +156,7 @@ export type Workflow = WorkflowTemplate & { export enum USE_CASE { SEMANTIC_SEARCH = 'SEMANTIC_SEARCH', + NEURAL_SPARSE_SEARCH = 'NEURAL_SPARSE_SEARCH', } /** @@ -196,6 +218,8 @@ export type PretrainedSentenceTransformer = PretrainedModel & { vectorDimensions: number; }; +export type PretrainedSparseEncodingModel = PretrainedModel & {}; + export type ModelConfig = { modelType?: string; embeddingDimension?: number; diff --git a/public/component_types/index.ts b/public/component_types/index.ts index f21f6aec..e713518e 100644 --- a/public/component_types/index.ts +++ b/public/component_types/index.ts @@ -6,3 +6,4 @@ export * from './interfaces'; export * 
from './transformer'; export * from './indexer'; +export * from './other'; diff --git a/public/component_types/indexer/indexer.ts b/public/component_types/indexer/indexer.ts index 0ee68b78..25959e02 100644 --- a/public/component_types/indexer/indexer.ts +++ b/public/component_types/indexer/indexer.ts @@ -20,9 +20,9 @@ export class Indexer extends BaseComponent { this.baseClasses = [this.type]; this.inputs = [ { - id: 'transformer', - label: 'Transformer', - baseClass: COMPONENT_CLASS.TRANSFORMER, + id: 'document', + label: 'Document', + baseClass: COMPONENT_CLASS.DOCUMENT, acceptMultiple: false, }, ]; diff --git a/public/component_types/indexer/knn_indexer.ts b/public/component_types/indexer/knn_indexer.ts index d0eba67d..fccef524 100644 --- a/public/component_types/indexer/knn_indexer.ts +++ b/public/component_types/indexer/knn_indexer.ts @@ -13,7 +13,7 @@ export class KnnIndexer extends Indexer { constructor() { super(); this.type = COMPONENT_CLASS.KNN_INDEXER; - this.label = 'K-NN Indexer'; + this.label = 'K-NN Index'; this.description = 'A specialized indexer for K-NN indices'; this.baseClasses = [...this.baseClasses, this.type]; this.createFields = [ diff --git a/public/component_types/other/document.tsx b/public/component_types/other/document.tsx new file mode 100644 index 00000000..2552b8d2 --- /dev/null +++ b/public/component_types/other/document.tsx @@ -0,0 +1,30 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +import { COMPONENT_CATEGORY, COMPONENT_CLASS } from '../../utils'; +import { BaseComponent } from '../base_component'; + +/** + * A basic Document placeholder UI component. + * Does not have any functionality. 
+ */ +export class Document extends BaseComponent { + constructor() { + super(); + this.type = COMPONENT_CLASS.DOCUMENT; + this.label = 'Document'; + this.description = 'A document to be ingested'; + this.categories = [COMPONENT_CATEGORY.INGEST]; + this.allowsCreation = false; + this.baseClasses = [this.type]; + this.inputs = []; + this.outputs = [ + { + label: this.label, + baseClasses: this.baseClasses, + }, + ]; + } +} diff --git a/public/component_types/other/index.ts b/public/component_types/other/index.ts new file mode 100644 index 00000000..3441a8ed --- /dev/null +++ b/public/component_types/other/index.ts @@ -0,0 +1,6 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +export * from './document'; diff --git a/public/component_types/transformer/index.ts b/public/component_types/transformer/index.ts index 5fe06d42..740503a0 100644 --- a/public/component_types/transformer/index.ts +++ b/public/component_types/transformer/index.ts @@ -5,3 +5,4 @@ export * from './ml_transformer'; export * from './text_embedding_transformer'; +export * from './sparse_encoder_transformer'; diff --git a/public/component_types/transformer/sparse_encoder_transformer.ts b/public/component_types/transformer/sparse_encoder_transformer.ts new file mode 100644 index 00000000..01ad8a92 --- /dev/null +++ b/public/component_types/transformer/sparse_encoder_transformer.ts @@ -0,0 +1,64 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +import { COMPONENT_CATEGORY, COMPONENT_CLASS } from '../../../common'; +import { MLTransformer } from '.'; + +/** + * A specialized sparse encoder ML transformer UI component + */ +export class SparseEncoderTransformer extends MLTransformer { + constructor() { + super(); + this.type = COMPONENT_CLASS.SPARSE_ENCODER_TRANSFORMER; + this.label = 'Sparse Encoder'; + this.description = + 'A specialized ML transformer to perform sparse encoding'; + this.categories = 
[COMPONENT_CATEGORY.INGEST]; + this.baseClasses = [...this.baseClasses, this.type]; + this.inputs = [ + { + id: 'document', + label: 'Document', + baseClass: COMPONENT_CLASS.DOCUMENT, + acceptMultiple: false, + }, + ]; + this.createFields = [ + { + label: 'Sparse Encoding Model', + id: 'model', + type: 'model', + helpText: + 'A sparse encoding model to be used for generating sparse vectors.', + helpLink: + 'https://opensearch.org/docs/latest/ml-commons-plugin/integrating-ml-models/#choosing-a-model', + }, + { + label: 'Input Field', + id: 'inputField', + type: 'string', + helpText: + 'The name of the document field from which to obtain text for generating sparse embeddings.', + helpLink: + 'https://opensearch.org/docs/latest/ingest-pipelines/processors/sparse-encoding/#configuration-parameters', + }, + { + label: 'Vector Field', + id: 'vectorField', + type: 'string', + helpText: `The name of the document's vector field in which to store the generated sparse embeddings.`, + helpLink: + 'https://opensearch.org/docs/latest/ingest-pipelines/processors/sparse-encoding/#configuration-parameters', + }, + ]; + this.outputs = [ + { + label: 'Transformed Document', + baseClasses: [COMPONENT_CLASS.DOCUMENT], + }, + ]; + } +} diff --git a/public/component_types/transformer/text_embedding_transformer.ts b/public/component_types/transformer/text_embedding_transformer.ts index affb996c..bf05674f 100644 --- a/public/component_types/transformer/text_embedding_transformer.ts +++ b/public/component_types/transformer/text_embedding_transformer.ts @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { COMPONENT_CLASS } from '../../../common'; +import { COMPONENT_CATEGORY, COMPONENT_CLASS } from '../../../common'; import { MLTransformer } from '.'; /** @@ -13,10 +13,18 @@ export class TextEmbeddingTransformer extends MLTransformer { constructor() { super(); this.type = COMPONENT_CLASS.TEXT_EMBEDDING_TRANSFORMER; - this.label = 'Text Embedding Transformer'; + this.label = 
'Text Embedder'; this.description = 'A specialized ML transformer for embedding text'; + this.categories = [COMPONENT_CATEGORY.INGEST]; this.baseClasses = [...this.baseClasses, this.type]; - this.inputs = []; + this.inputs = [ + { + id: 'document', + label: 'Document', + baseClass: COMPONENT_CLASS.DOCUMENT, + acceptMultiple: false, + }, + ]; this.createFields = [ { label: 'Text Embedding Model', @@ -31,7 +39,7 @@ export class TextEmbeddingTransformer extends MLTransformer { id: 'inputField', type: 'string', helpText: - 'The name of the field from which to obtain text for generating text embeddings.', + 'The name of the document field from which to obtain text for generating text embeddings.', helpLink: 'https://opensearch.org/docs/latest/ingest-pipelines/processors/text-embedding/', }, @@ -39,16 +47,15 @@ export class TextEmbeddingTransformer extends MLTransformer { label: 'Vector Field', id: 'vectorField', type: 'string', - helpText: - ' The name of the vector field in which to store the generated text embeddings.', + helpText: `The name of the document's vector field in which to store the generated text embeddings.`, helpLink: 'https://opensearch.org/docs/latest/ingest-pipelines/processors/text-embedding/', }, ]; this.outputs = [ { - label: this.label, - baseClasses: this.baseClasses, + label: 'Transformed Document', + baseClasses: [COMPONENT_CLASS.DOCUMENT], }, ]; } diff --git a/public/pages/workflow_detail/component_details/component_inputs.tsx b/public/pages/workflow_detail/component_details/component_inputs.tsx index 2786b6a8..262bb547 100644 --- a/public/pages/workflow_detail/component_details/component_inputs.tsx +++ b/public/pages/workflow_detail/component_details/component_inputs.tsx @@ -4,7 +4,7 @@ */ import React, { useState } from 'react'; -import { EuiSpacer, EuiText, EuiTitle } from '@elastic/eui'; +import { EuiHorizontalRule, EuiSpacer, EuiText, EuiTitle } from '@elastic/eui'; import { InputFieldList } from './input_field_list'; import { 
NODE_CATEGORY, ReactFlowComponent } from '../../../../common'; import { NewOrExistingTabs } from '../workspace/workspace_components/new_or_existing_tabs'; @@ -58,11 +58,12 @@ export function ComponentInputs(props: ComponentInputsProps) { {props.selectedComponent.data.description} - - + /> */} + { if (selectedRadioId !== undefined) { + // TODO: add fine-grained filtering so only relevant pretrained and existing models + // are visible based on the use case if (selectedRadioId === MODEL_CATEGORY.DEPLOYED) { setSelectableModels(deployedModels); } else { diff --git a/public/pages/workflow_detail/prototype/ingestor.tsx b/public/pages/workflow_detail/prototype/ingestor.tsx index dee9be85..00cbedac 100644 --- a/public/pages/workflow_detail/prototype/ingestor.tsx +++ b/public/pages/workflow_detail/prototype/ingestor.tsx @@ -16,28 +16,24 @@ import { USE_CASE, Workflow, getIndexName, - getSemanticSearchValues, + getNeuralSearchValues, } from '../../../../common'; import { ingest, useAppDispatch } from '../../../store'; import { getCore } from '../../../services'; -import { getFormattedJSONString } from './utils'; +import { + NeuralSparseValues, + SemanticSearchValues, + WorkflowValues, + getFormattedJSONString, +} from './utils'; interface IngestorProps { workflow: Workflow; } -type WorkflowValues = { - modelId: string; -}; - -type SemanticSearchValues = WorkflowValues & { - inputField: string; - vectorField: string; -}; - type DocGeneratorFn = ( queryText: string, - workflowValues: SemanticSearchValues + workflowValues: SemanticSearchValues | NeuralSparseValues ) => {}; /** @@ -188,8 +184,9 @@ function getDocGeneratorFn(workflow: Workflow): DocGeneratorFn { let fn; switch (workflow.use_case) { case USE_CASE.SEMANTIC_SEARCH: + case USE_CASE.NEURAL_SPARSE_SEARCH: default: { - fn = () => generateSemanticSearchDoc; + fn = () => generateNeuralSearchDoc; } } return fn; @@ -200,17 +197,18 @@ function getWorkflowValues(workflow: Workflow): WorkflowValues { let values; switch 
(workflow.use_case) { case USE_CASE.SEMANTIC_SEARCH: + case USE_CASE.NEURAL_SPARSE_SEARCH: default: { - values = getSemanticSearchValues(workflow); + values = getNeuralSearchValues(workflow); } } return values; } -// utility fn to generate a document suited for semantic search -function generateSemanticSearchDoc( +// utility fn to generate a document suited for neural search use cases +function generateNeuralSearchDoc( docValue: string, - workflowValues: SemanticSearchValues + workflowValues: SemanticSearchValues | NeuralSparseValues ): {} { return { [workflowValues.inputField]: docValue, diff --git a/public/pages/workflow_detail/prototype/query_executor.tsx b/public/pages/workflow_detail/prototype/query_executor.tsx index d8a794a1..c7ce567f 100644 --- a/public/pages/workflow_detail/prototype/query_executor.tsx +++ b/public/pages/workflow_detail/prototype/query_executor.tsx @@ -16,28 +16,24 @@ import { USE_CASE, Workflow, getIndexName, - getSemanticSearchValues, + getNeuralSearchValues, } from '../../../../common'; import { searchIndex, useAppDispatch } from '../../../store'; import { getCore } from '../../../services'; -import { getFormattedJSONString } from './utils'; +import { + NeuralSparseValues, + SemanticSearchValues, + WorkflowValues, + getFormattedJSONString, +} from './utils'; interface QueryExecutorProps { workflow: Workflow; } -type WorkflowValues = { - modelId: string; -}; - -type SemanticSearchValues = WorkflowValues & { - inputField: string; - vectorField: string; -}; - type QueryGeneratorFn = ( queryText: string, - workflowValues: SemanticSearchValues + workflowValues: SemanticSearchValues | NeuralSparseValues ) => {}; /** @@ -187,9 +183,13 @@ export function QueryExecutor(props: QueryExecutorProps) { function getQueryGeneratorFn(workflow: Workflow): QueryGeneratorFn { let fn; switch (workflow.use_case) { - case USE_CASE.SEMANTIC_SEARCH: - default: { + case USE_CASE.SEMANTIC_SEARCH: { fn = () => generateSemanticSearchQuery; + break; + } + case 
USE_CASE.NEURAL_SPARSE_SEARCH: + default: { + fn = () => generateNeuralSparseQuery; } } return fn; @@ -201,7 +201,7 @@ function getWorkflowValues(workflow: Workflow): WorkflowValues { switch (workflow.use_case) { case USE_CASE.SEMANTIC_SEARCH: default: { - values = getSemanticSearchValues(workflow); + values = getNeuralSearchValues(workflow); } } return values; @@ -213,6 +213,7 @@ function generateSemanticSearchQuery( workflowValues: SemanticSearchValues ): {} { return { + // TODO: can make this configurable _source: { excludes: [`${workflowValues.vectorField}`], }, @@ -221,6 +222,7 @@ function generateSemanticSearchQuery( [workflowValues.vectorField]: { query_text: queryText, model_id: workflowValues.modelId, + // TODO: expose k as configurable k: 5, }, }, @@ -228,6 +230,27 @@ function generateSemanticSearchQuery( }; } +// utility fn to generate a neural sparse search query +function generateNeuralSparseQuery( + queryText: string, + workflowValues: NeuralSparseValues +): {} { + return { + // TODO: can make this configurable + _source: { + excludes: [`${workflowValues.vectorField}`], + }, + query: { + neural_sparse: { + [workflowValues.vectorField]: { + query_text: queryText, + model_id: workflowValues.modelId, + }, + }, + }, + }; +} + function processHits(hits: any[]): {}[] { return hits.map((hit) => hit._source); } diff --git a/public/pages/workflow_detail/prototype/utils.ts b/public/pages/workflow_detail/prototype/utils.ts index c4e3a1a9..920d5d65 100644 --- a/public/pages/workflow_detail/prototype/utils.ts +++ b/public/pages/workflow_detail/prototype/utils.ts @@ -4,9 +4,22 @@ */ /** - * Shared utility fns used in the prototyping page. + * Shared utility fns and constants used in the prototyping page. */ +// UTILITY FNS export function getFormattedJSONString(obj: {}): string { return Object.values(obj).length > 0 ? 
JSON.stringify(obj, null, '\t') : ''; } + +// CONSTANTS +export type WorkflowValues = { + modelId: string; +}; + +export type SemanticSearchValues = WorkflowValues & { + inputField: string; + vectorField: string; +}; + +export type NeuralSparseValues = SemanticSearchValues; diff --git a/public/pages/workflow_detail/utils/data_extractor_utils.ts b/public/pages/workflow_detail/utils/data_extractor_utils.ts index f4fd01bb..3a49c840 100644 --- a/public/pages/workflow_detail/utils/data_extractor_utils.ts +++ b/public/pages/workflow_detail/utils/data_extractor_utils.ts @@ -33,7 +33,9 @@ export function getIndexName(workflow: Workflow): string | undefined { } } -export function getSemanticSearchValues( +// Returns values for neural search use cases. Note many of them +// persist the same values to use during ingest and search, so we keep the naming general +export function getNeuralSearchValues( workflow: Workflow ): { modelId: string; inputField: string; vectorField: string } { const modelId = getModelId(workflow) as string; diff --git a/public/pages/workflow_detail/utils/workflow_to_template_utils.ts b/public/pages/workflow_detail/utils/workflow_to_template_utils.ts index 4db87cc4..534151d5 100644 --- a/public/pages/workflow_detail/utils/workflow_to_template_utils.ts +++ b/public/pages/workflow_detail/utils/workflow_to_template_utils.ts @@ -29,6 +29,12 @@ import { BERT_SENTENCE_TRANSFORMER, REGISTER_LOCAL_PRETRAINED_MODEL_STEP_TYPE, generateId, + NEURAL_SPARSE_TRANSFORMER, + NEURAL_SPARSE_DOC_TRANSFORMER, + NEURAL_SPARSE_TOKENIZER_TRANSFORMER, + REGISTER_LOCAL_SPARSE_ENCODING_MODEL_STEP_TYPE, + SparseEncodingProcessor, + IndexMappings, } from '../../../../common'; /** @@ -89,7 +95,12 @@ function toProvisionTemplateFlow( }); edges.forEach((edge) => { - templateEdges.push(toTemplateEdge(edge)); + // it may be undefined if the edge is not convertible + // (e.g., connecting to some meta/other UI component, like "document" or "query") + const templateEdge = 
toTemplateEdge(edge); + if (templateEdge) { + templateEdges.push(templateEdge); + } }); return { @@ -110,11 +121,13 @@ function toTemplateNodes( } } -function toTemplateEdge(flowEdge: ReactFlowEdge): TemplateEdge { - return { - source: flowEdge.source, - dest: flowEdge.target, - }; +function toTemplateEdge(flowEdge: ReactFlowEdge): TemplateEdge | undefined { + return isValidTemplateEdge(flowEdge) + ? { + source: flowEdge.source, + dest: flowEdge.target, + } + : undefined; } // General fn to process all ML transform nodes. Convert into a final @@ -124,12 +137,12 @@ function toTemplateEdge(flowEdge: ReactFlowEdge): TemplateEdge { function transformerToTemplateNodes( flowNode: ReactFlowComponent ): TemplateNode[] { - // TODO a few improvements to make here: + // TODO improvements to make here: // 1. Consideration of multiple ingest processors and how to collect them all, and finally create // a single ingest pipeline with all of them, in the same order as done on the UI - // 2. Support more than just text embedding transformers switch (flowNode.data.type) { case COMPONENT_CLASS.TEXT_EMBEDDING_TRANSFORMER: + case COMPONENT_CLASS.SPARSE_ENCODER_TRANSFORMER: default: { const { model, inputField, vectorField } = componentDataToFormik( flowNode.data @@ -141,6 +154,12 @@ function transformerToTemplateNodes( const modelId = model.id; const ingestPipelineName = generateId('ingest_pipeline'); + // register model workflow step type is different per use case + const registerModelStepType = + flowNode.data.type === COMPONENT_CLASS.TEXT_EMBEDDING_TRANSFORMER + ? 
REGISTER_LOCAL_PRETRAINED_MODEL_STEP_TYPE + : REGISTER_LOCAL_SPARSE_ENCODING_MODEL_STEP_TYPE; + let registerModelStep = undefined as | RegisterPretrainedModelNode | undefined; @@ -149,13 +168,17 @@ function transformerToTemplateNodes( ROBERTA_SENTENCE_TRANSFORMER, MPNET_SENTENCE_TRANSFORMER, BERT_SENTENCE_TRANSFORMER, + NEURAL_SPARSE_TRANSFORMER, + NEURAL_SPARSE_DOC_TRANSFORMER, + NEURAL_SPARSE_TOKENIZER_TRANSFORMER, ].find( // the model ID in the form will be the unique name of the pretrained model (model) => model.name === modelId ) as PretrainedSentenceTransformer; + registerModelStep = { - id: REGISTER_LOCAL_PRETRAINED_MODEL_STEP_TYPE, - type: REGISTER_LOCAL_PRETRAINED_MODEL_STEP_TYPE, + id: registerModelStepType, + type: registerModelStepType, user_inputs: { name: pretrainedModel.name, description: pretrainedModel.description, @@ -170,9 +193,35 @@ function transformerToTemplateNodes( // or directly from the user const finalModelId = registerModelStep !== undefined - ? `\${{${REGISTER_LOCAL_PRETRAINED_MODEL_STEP_TYPE}.model_id}}` + ? `\${{${registerModelStepType}.model_id}}` : modelId; + // processor is different per use case + const finalProcessor = + flowNode.data.type === COMPONENT_CLASS.TEXT_EMBEDDING_TRANSFORMER + ? ({ + text_embedding: { + model_id: finalModelId, + field_map: { + [inputField]: vectorField, + }, + }, + } as TextEmbeddingProcessor) + : ({ + sparse_encoding: { + model_id: finalModelId, + field_map: { + [inputField]: vectorField, + }, + }, + } as SparseEncodingProcessor); + + // ingest pipeline is different per use case + const finalIngestPipelineDescription = + flowNode.data.type === COMPONENT_CLASS.TEXT_EMBEDDING_TRANSFORMER + ? 
'An ingest pipeline with a text embedding processor' + : 'An ingest pipeline with a neural sparse encoding processor'; + const createIngestPipelineStep = { id: flowNode.data.id, type: CREATE_INGEST_PIPELINE_STEP_TYPE, @@ -182,20 +231,17 @@ function transformerToTemplateNodes( input_field: inputField, output_field: vectorField, configurations: { - description: 'An ingest pipeline with a text embedding processor.', - processors: [ - { - text_embedding: { - model_id: finalModelId, - field_map: { - [inputField]: vectorField, - }, - }, - } as TextEmbeddingProcessor, - ], + description: finalIngestPipelineDescription, + processors: [finalProcessor], }, }, } as CreateIngestPipelineNode; + if (registerModelStep !== undefined) { + createIngestPipelineStep.previous_node_inputs = { + ...createIngestPipelineStep.previous_node_inputs, + [registerModelStepType]: 'model_id', + }; + } return registerModelStep !== undefined ? [registerModelStep, createIngestPipelineStep] @@ -217,30 +263,22 @@ function indexerToTemplateNode( // TODO: remove hardcoded logic here that is assuming each indexer node has // exactly 1 directly connected create_ingest_pipeline predecessor node that // contains an inputField and vectorField - const directlyConnectedNodeId = getDirectlyConnectedNodes( - flowNode, - edges - )[0]; - const { inputField, vectorField } = getDirectlyConnectedNodeInputs( + const directlyConnectedNode = getDirectlyConnectedNodes( flowNode, prevNodes, edges - ); + )[0]; - return { - id: flowNode.data.id, - type: CREATE_INDEX_STEP_TYPE, - previous_node_inputs: { - [directlyConnectedNodeId]: 'pipeline_id', - }, - user_inputs: { - index_name: indexName, - configurations: { - settings: { - default_pipeline: `\${{${directlyConnectedNodeId}.pipeline_id}}`, - }, - mappings: { - properties: { + const { inputField, vectorField } = getNodeValues([ + directlyConnectedNode, + ]); + + // index mappings are different per use case + const finalIndexMappings = { + properties: + 
directlyConnectedNode.data.type === + COMPONENT_CLASS.TEXT_EMBEDDING_TRANSFORMER + ? { [vectorField]: { type: 'knn_vector', // TODO: remove hardcoding, fetch from the selected model @@ -256,8 +294,30 @@ function indexerToTemplateNode( [inputField]: { type: 'text', }, + } + : { + [vectorField]: { + type: 'rank_features', + }, + [inputField]: { + type: 'text', + }, }, + } as IndexMappings; + + return { + id: flowNode.data.id, + type: CREATE_INDEX_STEP_TYPE, + previous_node_inputs: { + [directlyConnectedNode.id]: 'pipeline_id', + }, + user_inputs: { + index_name: indexName, + configurations: { + settings: { + default_pipeline: `\${{${directlyConnectedNode.id}.pipeline_id}}`, }, + mappings: finalIndexMappings, }, }, }; @@ -265,18 +325,22 @@ function indexerToTemplateNode( } } -// Fetch all directly connected predecessor node inputs -function getDirectlyConnectedNodeInputs( +// Fetch all directly connected predecessor nodes +function getDirectlyConnectedNodes( node: ReactFlowComponent, prevNodes: ReactFlowComponent[], edges: ReactFlowEdge[] -): FormikValues { - const directlyConnectedNodeIds = getDirectlyConnectedNodes(node, edges); - const directlyConnectedNodes = prevNodes.filter((prevNode) => +): ReactFlowComponent[] { + const directlyConnectedNodeIds = getDirectlyConnectedNodeIds(node, edges); + return prevNodes.filter((prevNode) => directlyConnectedNodeIds.includes(prevNode.id) ); +} + +// Get all values for an arr of flow nodes +function getNodeValues(nodes: ReactFlowComponent[]): FormikValues { let values = {} as FormikValues; - directlyConnectedNodes.forEach((node) => { + nodes.forEach((node) => { values = { ...values, ...componentDataToFormik(node.data), @@ -285,8 +349,8 @@ function getDirectlyConnectedNodeInputs( return values; } -// Simple utility fn to fetch all direct predecessor node IDs for a given node -function getDirectlyConnectedNodes( +// Fetch all direct predecessor node IDs for a given node +function getDirectlyConnectedNodeIds( flowNode: 
ReactFlowComponent, edges: ReactFlowEdge[] ): string[] { @@ -298,3 +362,12 @@ function getDirectlyConnectedNodes( }); return incomingNodes; } + +function isValidTemplateEdge(flowEdge: ReactFlowEdge): boolean { + // TODO: may need to expand to handle multiple classes in the future (e.g., some 'query' component) + const invalidClass = COMPONENT_CLASS.DOCUMENT; + return ( + !flowEdge.sourceClasses?.includes(invalidClass) && + !flowEdge.targetClasses?.includes(invalidClass) + ); +} diff --git a/public/pages/workflow_detail/workspace/resizable_workspace.tsx b/public/pages/workflow_detail/workspace/resizable_workspace.tsx index 90cba2fb..6baa6e59 100644 --- a/public/pages/workflow_detail/workspace/resizable_workspace.tsx +++ b/public/pages/workflow_detail/workspace/resizable_workspace.tsx @@ -330,7 +330,7 @@ export function ResizableWorkspace(props: ResizableWorkspaceProps) { iconType="alert" style={{ marginBottom: '16px' }} > - Changes cannot be saved until the flow has first been + Changes cannot be saved until the workflow has first been deprovisioned. )} diff --git a/public/pages/workflows/new_workflow/utils.ts b/public/pages/workflows/new_workflow/utils.ts index 7cc5b4dc..3c43526c 100644 --- a/public/pages/workflows/new_workflow/utils.ts +++ b/public/pages/workflows/new_workflow/utils.ts @@ -19,6 +19,8 @@ import { COMPONENT_CLASS, START_FROM_SCRATCH_WORKFLOW_NAME, DEFAULT_NEW_WORKFLOW_NAME, + Document, + SparseEncoderTransformer, } from '../../../../common'; // Fn to produce the complete preset template with all necessary UI metadata. 
@@ -34,6 +36,10 @@ export function enrichPresetWorkflowWithUiMetadata( workspaceFlowState = fetchSemanticSearchWorkspaceFlow(); break; } + case USE_CASE.NEURAL_SPARSE_SEARCH: { + workspaceFlowState = fetchNeuralSparseSearchWorkspaceFlow(); + break; + } default: { workspaceFlowState = fetchEmptyWorkspaceFlow(); break; @@ -57,11 +63,13 @@ function fetchEmptyWorkspaceFlow(): WorkspaceFlowState { } function fetchSemanticSearchWorkspaceFlow(): WorkspaceFlowState { + const ingestId0 = generateId(COMPONENT_CLASS.DOCUMENT); const ingestId1 = generateId(COMPONENT_CLASS.TEXT_EMBEDDING_TRANSFORMER); const ingestId2 = generateId(COMPONENT_CLASS.KNN_INDEXER); const ingestGroupId = generateId(COMPONENT_CATEGORY.INGEST); - const searchGroupId = generateId(COMPONENT_CATEGORY.SEARCH); - const edgeId = generateId('edge'); + // const searchGroupId = generateId(COMPONENT_CATEGORY.SEARCH); + const edgeId0 = generateId('edge'); + const edgeId1 = generateId('edge'); const ingestNodes = [ { @@ -70,7 +78,7 @@ function fetchSemanticSearchWorkspaceFlow(): WorkspaceFlowState { type: NODE_CATEGORY.INGEST_GROUP, data: { label: COMPONENT_CATEGORY.INGEST }, style: { - width: 900, + width: 1300, height: 400, }, className: 'reactflow__group-node__ingest', @@ -79,8 +87,18 @@ function fetchSemanticSearchWorkspaceFlow(): WorkspaceFlowState { deletable: false, }, { - id: ingestId1, + id: ingestId0, position: { x: 100, y: 70 }, + data: initComponentData(new Document().toObj(), ingestId0), + type: NODE_CATEGORY.CUSTOM, + parentNode: ingestGroupId, + extent: 'parent', + draggable: false, + deletable: false, + }, + { + id: ingestId1, + position: { x: 500, y: 70 }, data: initComponentData( new TextEmbeddingTransformer().toObj(), ingestId1 @@ -93,7 +111,7 @@ function fetchSemanticSearchWorkspaceFlow(): WorkspaceFlowState { }, { id: ingestId2, - position: { x: 500, y: 70 }, + position: { x: 900, y: 70 }, data: initComponentData(new KnnIndexer().toObj(), ingestId2), type: NODE_CATEGORY.CUSTOM, parentNode: 
ingestGroupId, @@ -103,31 +121,154 @@ function fetchSemanticSearchWorkspaceFlow(): WorkspaceFlowState { }, ] as ReactFlowComponent[]; - const searchNodes = [ + // const searchNodes = [ + // { + // id: searchGroupId, + // position: { x: 400, y: 1000 }, + // type: NODE_CATEGORY.SEARCH_GROUP, + // data: { label: COMPONENT_CATEGORY.SEARCH }, + // style: { + // width: 900, + // height: 400, + // }, + // className: 'reactflow__group-node__search', + // selectable: true, + // draggable: false, + // deletable: false, + // }, + // ] as ReactFlowComponent[]; + const searchNodes = [] as ReactFlowComponent[]; + + return { + nodes: [...ingestNodes, ...searchNodes], + edges: [ + { + id: edgeId0, + key: edgeId0, + source: ingestId0, + target: ingestId1, + sourceClasses: ingestNodes.find((node) => node.id === ingestId0)?.data + .baseClasses, + targetClasses: ingestNodes.find((node) => node.id === ingestId1)?.data + .baseClasses, + markerEnd: { + type: MarkerType.ArrowClosed, + width: 20, + height: 20, + }, + zIndex: 2, + deletable: false, + }, + { + id: edgeId1, + key: edgeId1, + source: ingestId1, + target: ingestId2, + sourceClasses: ingestNodes.find((node) => node.id === ingestId1)?.data + .baseClasses, + targetClasses: ingestNodes.find((node) => node.id === ingestId2)?.data + .baseClasses, + markerEnd: { + type: MarkerType.ArrowClosed, + width: 20, + height: 20, + }, + zIndex: 2, + deletable: false, + }, + ] as ReactFlowEdge[], + }; +} + +function fetchNeuralSparseSearchWorkspaceFlow(): WorkspaceFlowState { + const ingestId0 = generateId(COMPONENT_CLASS.DOCUMENT); + const ingestId1 = generateId(COMPONENT_CLASS.SPARSE_ENCODER_TRANSFORMER); + const ingestId2 = generateId(COMPONENT_CLASS.KNN_INDEXER); + const ingestGroupId = generateId(COMPONENT_CATEGORY.INGEST); + const edgeId0 = generateId('edge'); + const edgeId1 = generateId('edge'); + + const ingestNodes = [ { - id: searchGroupId, - position: { x: 400, y: 1000 }, - type: NODE_CATEGORY.SEARCH_GROUP, - data: { label: 
COMPONENT_CATEGORY.SEARCH }, + id: ingestGroupId, + position: { x: 400, y: 400 }, + type: NODE_CATEGORY.INGEST_GROUP, + data: { label: COMPONENT_CATEGORY.INGEST }, style: { - width: 900, + width: 1300, height: 400, }, - className: 'reactflow__group-node__search', + className: 'reactflow__group-node__ingest', selectable: true, draggable: false, deletable: false, }, + { + id: ingestId0, + position: { x: 100, y: 70 }, + data: initComponentData(new Document().toObj(), ingestId0), + type: NODE_CATEGORY.CUSTOM, + parentNode: ingestGroupId, + extent: 'parent', + draggable: false, + deletable: false, + }, + { + id: ingestId1, + position: { x: 500, y: 70 }, + data: initComponentData( + new SparseEncoderTransformer().toObj(), + ingestId1 + ), + type: NODE_CATEGORY.CUSTOM, + parentNode: ingestGroupId, + extent: 'parent', + draggable: false, + deletable: false, + }, + { + id: ingestId2, + position: { x: 900, y: 70 }, + data: initComponentData(new KnnIndexer().toObj(), ingestId2), + type: NODE_CATEGORY.CUSTOM, + parentNode: ingestGroupId, + extent: 'parent', + draggable: false, + deletable: false, + }, ] as ReactFlowComponent[]; + const searchNodes = [] as ReactFlowComponent[]; + return { nodes: [...ingestNodes, ...searchNodes], edges: [ { - id: edgeId, - key: edgeId, + id: edgeId0, + key: edgeId0, + source: ingestId0, + target: ingestId1, + sourceClasses: ingestNodes.find((node) => node.id === ingestId0)?.data + .baseClasses, + targetClasses: ingestNodes.find((node) => node.id === ingestId1)?.data + .baseClasses, + markerEnd: { + type: MarkerType.ArrowClosed, + width: 20, + height: 20, + }, + zIndex: 2, + deletable: false, + }, + { + id: edgeId1, + key: edgeId1, source: ingestId1, target: ingestId2, + sourceClasses: ingestNodes.find((node) => node.id === ingestId1)?.data + .baseClasses, + targetClasses: ingestNodes.find((node) => node.id === ingestId2)?.data + .baseClasses, markerEnd: { type: MarkerType.ArrowClosed, width: 20, diff --git 
a/public/pages/workflows/workflow_list/columns.tsx b/public/pages/workflows/workflow_list/columns.tsx index 45615cb4..eefda067 100644 --- a/public/pages/workflows/workflow_list/columns.tsx +++ b/public/pages/workflows/workflow_list/columns.tsx @@ -16,6 +16,7 @@ export const columns = (actions: any[]) => [ { field: 'name', name: 'Name', + width: '20%', sortable: true, render: (name: string, workflow: Workflow) => ( {name} @@ -29,6 +30,7 @@ export const columns = (actions: any[]) => [ { field: 'use_case', name: 'Type', + width: '30%', sortable: true, }, { diff --git a/public/utils/constants.ts b/public/utils/constants.ts index 74dd67b7..8450fd2b 100644 --- a/public/utils/constants.ts +++ b/public/utils/constants.ts @@ -49,6 +49,9 @@ export enum COMPONENT_CLASS { JSON_TO_JSON_TRANSFORMER = 'json_to_json_transformer', ML_TRANSFORMER = 'ml_transformer', TEXT_EMBEDDING_TRANSFORMER = 'text_embedding_transformer', + SPARSE_ENCODER_TRANSFORMER = 'sparse_encoder_transformer', // Query-related classes QUERY = 'query', + // Document-related classes + DOCUMENT = 'document', } diff --git a/server/resources/templates/neural_sparse_search.json b/server/resources/templates/neural_sparse_search.json new file mode 100644 index 00000000..3778b2e5 --- /dev/null +++ b/server/resources/templates/neural_sparse_search.json @@ -0,0 +1,12 @@ +{ + "name": "Neural Sparse Search", + "description": "A basic workflow containing the ingest pipeline and index configurations for performing neural sparse search", + "use_case": "NEURAL_SPARSE_SEARCH", + "version": { + "template": "1.0.0", + "compatibility": [ + "2.13.0", + "3.0.0" + ] + } +} \ No newline at end of file diff --git a/server/resources/templates/semantic_search.json b/server/resources/templates/semantic_search.json index 83995824..6436c279 100644 --- a/server/resources/templates/semantic_search.json +++ b/server/resources/templates/semantic_search.json @@ -1,11 +1,11 @@ { "name": "Semantic Search", - "description": "This semantic search 
workflow includes the essential ingestion and search pipelines that covers the most common search use cases.", + "description": "A basic workflow containing the ingest pipeline and index configurations for performing semantic search", "use_case": "SEMANTIC_SEARCH", "version": { "template": "1.0.0", "compatibility": [ - "2.12.0", + "2.13.0", "3.0.0" ] }