numaproj · dpadhiar · Sep 19, 2023 · Sep 18, 2023 · Sep 18, 2023 · Sep 18, 2023
@@ -174,6 +174,10 @@ func ValidatePipeline(pl *dfv1.Pipeline) error {
 		return fmt.Errorf("not all the vertex names are defined in edges")
 	}
 
+	if isAForest(pl) {
+		return fmt.Errorf("invalid pipeline, cannot be disjointed")
+	}
+
 	// Prevent pipelines with Cycles in the case that there is a Reduce Vertex at the point of the cycle or to the right of it.
 	// Whenever there's a cycle, there will inherently be "late data", and we don't want late data for a Reduce Vertex, which may
 	// have already "closed the book" on the data's time window.
@@ -472,3 +476,40 @@ func toVerticesMappedByFrom(edges []dfv1.Edge, verticesByName map[string]*dfv1.A
 	}
 	return mappedEdges, nil
 }
+
+// isAForest reports whether the pipeline is a disjointed graph, i.e. multiple pipelines are defined in one spec.
+// It walks the graph from the first vertex via buildVisitedMap; any vertex left unvisited means it is a forest.
+func isAForest(pl *dfv1.Pipeline) bool {
+	if len(pl.Spec.Vertices) == 0 {
+		return false // nothing to traverse; also avoids indexing Vertices[0] on an empty spec
+	}
+	visited := map[string]struct{}{}
+	buildVisitedMap(pl.Spec.Vertices[0].Name, visited, pl)
+	return len(visited) != len(pl.Spec.Vertices)
+}
+
+// buildVisitedMap is a helper function that traverses the pipeline using DFS, starting at vtxName.
+// This is a recursive function. Each call records its vertex in visited, which the caller checks afterwards.
+// Edges are followed through both their To and From endpoints, so direction does not limit which vertices are reached.
+func buildVisitedMap(vtxName string, visited map[string]struct{}, pl *dfv1.Pipeline) {
+
+	// mark before recursing so that cycles and self-loops terminate
+	visited[vtxName] = struct{}{}
+
+	// The visited check inside each loop also de-duplicates neighbors reached through several edges.
+
+	// visit the To endpoint of every edge returned by GetToEdges for this vertex
+	for _, e := range pl.GetToEdges(vtxName) {
+		if _, alreadyVisited := visited[e.To]; !alreadyVisited {
+			buildVisitedMap(e.To, visited, pl)
+		}
+	}
+
+	// visit the From endpoint of every edge returned by GetFromEdges for this vertex
+	for _, e := range pl.GetFromEdges(vtxName) {
+		if _, alreadyVisited := visited[e.From]; !alreadyVisited {
+			buildVisitedMap(e.From, visited, pl)
+		}
+	}
+
+}
@@ -163,6 +163,57 @@ var (
 			},
 		},
 	}
+
+	testForestPipeline = &dfv1.Pipeline{ // fixture: one spec whose edges form two components that share no vertex
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "test-pl",
+			Namespace: "test-ns",
+		},
+		Spec: dfv1.PipelineSpec{
+			Vertices: []dfv1.AbstractVertex{
+				{
+					Name: "input",
+					Source: &dfv1.Source{
+						UDTransformer: &dfv1.UDTransformer{
+							Builtin: &dfv1.Transformer{Name: "filter"},
+						}},
+				},
+				{
+					Name: "input-1",
+					Source: &dfv1.Source{
+						UDTransformer: &dfv1.UDTransformer{
+							Builtin: &dfv1.Transformer{Name: "filter"},
+						}},
+				},
+				{
+					Name: "p1",
+					UDF: &dfv1.UDF{
+						Builtin: &dfv1.Function{Name: "cat"},
+					},
+				},
+				{
+					Name: "p2",
+					UDF: &dfv1.UDF{
+						Builtin: &dfv1.Function{Name: "cat"},
+					},
+				},
+				{
+					Name: "output",
+					Sink: &dfv1.Sink{},
+				},
+				{
+					Name: "output-1",
+					Sink: &dfv1.Sink{},
+				},
+			},
+			Edges: []dfv1.Edge{
+				{From: "input", To: "p1"},    // first component: input -> p1 -> output
+				{From: "p1", To: "output"},   // first component
+				{From: "input-1", To: "p2"},  // second component: input-1 -> p2 -> output-1
+				{From: "p2", To: "output-1"}, // second component
+			},
+		},
+	}
 )
 
 func TestValidatePipeline(t *testing.T) {
@@ -273,6 +324,52 @@ func TestValidatePipeline(t *testing.T) {
 		assert.Contains(t, err.Error(), "can not specify both builtin function, and a customized image")
 	})
 
+	t.Run("forest - two pipelines with 1 source/sink", func(t *testing.T) {
+		testObj := testForestPipeline.DeepCopy()
+		err := ValidatePipeline(testObj)
+		assert.Error(t, err)
+		assert.Contains(t, err.Error(), "invalid pipeline")
+	})
+
+	t.Run("forest - second pipeline has no sink", func(t *testing.T) {
+		testObj := testForestPipeline.DeepCopy()
+		testObj.Spec.Vertices[5].Sink = nil
+		testObj.Spec.Vertices[5].UDF = &dfv1.UDF{}
+		err := ValidatePipeline(testObj)
+		assert.Error(t, err)
+		assert.Contains(t, err.Error(), "invalid vertex")
+	})
+
+	t.Run("forest - two pipelines with multiple sources/sinks", func(t *testing.T) {
+		testObj := testForestPipeline.DeepCopy()
+		testObj.Spec.Vertices = append(testObj.Spec.Vertices, dfv1.AbstractVertex{Name: "input-2", Source: &dfv1.Source{}})
+		testObj.Spec.Vertices = append(testObj.Spec.Vertices, dfv1.AbstractVertex{Name: "output-2", Sink: &dfv1.Sink{}})
+		testObj.Spec.Edges = append(testObj.Spec.Edges, dfv1.Edge{From: "input-2", To: "p1"})
+		testObj.Spec.Edges = append(testObj.Spec.Edges, dfv1.Edge{From: "p2", To: "output-2"})
+		err := ValidatePipeline(testObj)
+		assert.Error(t, err)
+		assert.Contains(t, err.Error(), "invalid pipeline")
+	})
+
+	t.Run("forest - pipelines have cycles", func(t *testing.T) {
+		testObj := testForestPipeline.DeepCopy()
+		testObj.Spec.Edges = append(testObj.Spec.Edges, dfv1.Edge{From: "p1", To: "p1"})
+		testObj.Spec.Edges = append(testObj.Spec.Edges, dfv1.Edge{From: "p2", To: "p2"})
+		err := ValidatePipeline(testObj)
+		assert.Error(t, err)
+		assert.Contains(t, err.Error(), "invalid pipeline")
+	})
+
+	t.Run("valid pipeline with multiple sinks/sources", func(t *testing.T) {
+		testObj := testPipeline.DeepCopy()
+		testObj.Spec.Vertices = append(testObj.Spec.Vertices, dfv1.AbstractVertex{Name: "input-1", Source: &dfv1.Source{}})
+		testObj.Spec.Vertices = append(testObj.Spec.Vertices, dfv1.AbstractVertex{Name: "output-1", Sink: &dfv1.Sink{}})
+		testObj.Spec.Edges = append(testObj.Spec.Edges, dfv1.Edge{From: "input-1", To: "p1"})
+		testObj.Spec.Edges = append(testObj.Spec.Edges, dfv1.Edge{From: "p1", To: "output-1"})
+		err := ValidatePipeline(testObj)
+		assert.NoError(t, err)
+	})
+
 	t.Run("edge - invalid vertex name", func(t *testing.T) {
 		testObj := testPipeline.DeepCopy()
 		testObj.Spec.Edges = append(testObj.Spec.Edges, dfv1.Edge{From: "a", To: "b"})