diff --git a/fastrepl/runner.py b/fastrepl/runner.py
index e4965de..702d8c9 100644
--- a/fastrepl/runner.py
+++ b/fastrepl/runner.py
@@ -25,19 +25,11 @@ def __init__(
         dataset: Dataset,
         output_feature="result",
     ) -> None:
-        self._evaluator = evaluator
-        self._dataset = dataset
-
+        self._output_feature = output_feature
         self._input_features = [
             param for param in inspect.signature(evaluator.run).parameters.keys()
         ]
-        self._output_feature = output_feature
 
-    def _validate(
-        self,
-        evaluator: fastrepl.Evaluator,
-        dataset: Dataset,
-    ) -> None:
         if any(feature not in dataset.column_names for feature in self._input_features):
             eval_name = type(evaluator).__name__
@@ -45,8 +37,8 @@ def _validate(
                 f"{eval_name} requires {self._input_features}, but the provided dataset has {dataset.column_names}"
             )
 
-    def _run_eval(self, **kwargs) -> Optional[Any]:
-        return self._evaluator.run(**kwargs)
+        self._evaluator = evaluator
+        self._dataset = dataset
 
     def _run(self, progress: Progress, task_id: TaskID) -> List[Optional[Any]]:
         results = []
@@ -54,7 +46,7 @@ def _run(self, progress: Progress, task_id: TaskID) -> List[Optional[Any]]:
         with ThreadPool(NUM_THREADS) as pool:
             futures = [
                 pool.apply_async(
-                    self._run_eval,
+                    self._evaluator.run,
                     kwds={
                         feature: value
                         for feature, value in zip(self._input_features, values)
@@ -71,8 +63,6 @@ def _run(self, progress: Progress, task_id: TaskID) -> List[Optional[Any]]:
         return results
 
     def run(self, num=1) -> Dataset:
-        self._validate(self._evaluator, self._dataset)
-
         with Progress() as progress:
             msg = "[cyan]Processing..."
             task_id = progress.add_task(msg, total=len(self._dataset) * num)
diff --git a/fastrepl/utils/kappa.py b/fastrepl/utils/kappa.py
index 3ff04b3..d32b7f1 100644
--- a/fastrepl/utils/kappa.py
+++ b/fastrepl/utils/kappa.py
@@ -1,34 +1,41 @@
-from typing import List, Any
+from typing import List, Any, cast
 
 from sklearn.metrics import confusion_matrix
 from sklearn.preprocessing import LabelEncoder
-from statsmodels.stats.inter_rater import cohens_kappa
+from statsmodels.stats.inter_rater import cohens_kappa, fleiss_kappa, aggregate_raters
 
 
-def kappa(*predictions: List[Any]) -> float:
+def kappa(predictions: List[List[Any]]) -> float:
     if len(predictions) < 2:
         raise ValueError
 
-    if len(predictions) > 2:
-        raise NotImplementedError
-
-    # TODO: We only support cohens_kappa for now
-    assert len(predictions) == 2
-
-    if len(predictions[0]) == 0 or len(predictions[1]) == 0:
+    if any(len(ps) == 0 for ps in predictions):
         raise ValueError
 
+    # TODO: hacky none-handling
     if isinstance(predictions[0][0], str):
-        # TODO: workaround for None
-        a = ["" if p is None else p for p in predictions[0]]
-        b = ["" if p is None else p for p in predictions[1]]
+        for i, ps in enumerate(predictions):
+            predictions[i] = ["" if p is None else p for p in ps]
 
         le = LabelEncoder()
-        le.fit(list(set(a + b)))
+        le.fit(list(set([p for ps in predictions for p in ps])))
 
-        a, b = le.transform(a), le.transform(b)
+        predictions = [le.transform(p) for p in predictions]
     else:
-        # TODO: workaround for None
-        a = [-1 if p is None else p for p in predictions[0]]
-        b = [-1 if p is None else p for p in predictions[1]]
+        predictions = [[-1 if p is None else p for p in ps] for ps in predictions]
+
+    if len(predictions) == 2:
+        return _cohens_kappa(predictions[0], predictions[1])
+    return _fleiss_kappa(predictions)
+
+
+def _cohens_kappa(a: List[Any], b: List[Any]) -> float:
+    return cohens_kappa(
+        table=confusion_matrix(a, b),
+        return_results=False,
+    )
+
-    return cohens_kappa(table=confusion_matrix(a, b), return_results=False)
+def _fleiss_kappa(predictions: List[List[Any]]) -> float:
+    input = list(zip(*predictions))  # transpose
+    table, _ = aggregate_raters(input)
+    return fleiss_kappa(table)
diff --git a/tests/unit/test_kappa.py b/tests/unit/test_kappa.py
index 16c985e..b8e358e 100644
--- a/tests/unit/test_kappa.py
+++ b/tests/unit/test_kappa.py
@@ -1,6 +1,5 @@
 import pytest
 
-from sklearn.metrics import confusion_matrix
 from statsmodels.stats.inter_rater import cohens_kappa, fleiss_kappa, aggregate_raters
 
 from fastrepl.utils import kappa
@@ -21,7 +20,13 @@ def test_basic2(self):
 
 
 class TestFleissKappa:
-    def test_aggregate_raters(self):
+    def test_aggregate_raters_1(self):
+        table, categories = aggregate_raters([[0, 1, 2], [1, 0, 1]])
+
+        assert (table == [[1, 1, 1], [1, 2, 0]]).all()
+        assert (categories == [0, 1, 2]).all()
+
+    def test_aggregate_raters_2(self):
         table, categories = aggregate_raters(
             [[0, 1, 2], [1, 0, 1], [2, 2, 0], [1, 0, 2]]
         )
@@ -29,6 +34,23 @@ def test_aggregate_raters(self):
         assert (table == [[1, 1, 1], [1, 2, 0], [1, 0, 2], [1, 1, 1]]).all()
         assert (categories == [0, 1, 2]).all()
 
+    def test_basic(self):
+        table, _ = aggregate_raters(
+            [  # Note that this is result of 3 raters
+                [1, 1, 1],
+                [1, 1, 1],
+                [1, 1, 1],
+                [1, 1, 1],
+                [3, 3, 2],
+                [0, 0, 0],
+                [0, 0, 0],
+                [1, 1, 1],
+            ]
+        )
+        result = fleiss_kappa(table)
+
+        assert result == pytest.approx(0.84516, abs=1e-5)
+
 
 @pytest.mark.parametrize(
     "predictions, result",
@@ -55,7 +77,34 @@ def test_aggregate_raters(self):
             ],
             0.499,
         ),
+    ]
+    + [
+        ([[1, None], [1, 2], [None, 2]], 0),
+        ([[1, 2, 3], [1, 2, 3], [1, 2, 3]], 1.0),
+        ([[1, 2, 3], [1, 1, 3], [1, 3, 3]], 0.437),
+        (
+            [
+                [1, 1, 1, 1, 3, 0, 0, 1],
+                [1, 1, 1, 1, 3, 0, 0, 1],
+                [1, 1, 1, 1, 2, 0, 0, 1],
+            ],
+            0.845,
+        ),
+        (
+            [
+                ["POSITIVE", "NEGATIVE", "POSITIVE"],
+                ["NEGATIVE", "POSITIVE", "NEGATIVE"],
+            ],
+            -0.799,
+        ),
+        (
+            [
+                ["POSITIVE", "NEGATIVE", "POSITIVE"],
+                ["POSITIVE", "NEGATIVE", "POSITIVE"],
+            ],
+            1.0,
+        ),
     ],
 )
 def test_kappa(predictions, result):
-    assert kappa(*predictions) == pytest.approx(result, abs=1e-3)
+    assert kappa(predictions) == pytest.approx(result, abs=1e-3)
diff --git a/tests/unit/test_runner.py b/tests/unit/test_runner.py
index 81c0ea8..51b3b9e 100644
--- a/tests/unit/test_runner.py
+++ b/tests/unit/test_runner.py
@@ -63,4 +63,4 @@ def test_validation(self):
         )
 
         with pytest.raises(ValueError):
-            fastrepl.LocalRunner(evaluator=eval, dataset=ds).run()
+            fastrepl.LocalRunner(evaluator=eval, dataset=ds)
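With this change, kappa takes the rater predictions as a single list of lists: exactly two raters route to Cohen's kappa, three or more fall back to Fleiss' kappa via aggregate_raters. A minimal usage sketch follows, assuming the fastrepl package from this branch is installed and exports kappa from fastrepl.utils as the test imports above do; the expected values are copied from the parametrized cases in tests/unit/test_kappa.py.

import pytest

from fastrepl.utils import kappa  # assumes fastrepl from this branch is importable

# Two raters: routed to Cohen's kappa; identical label sequences give perfect agreement.
assert kappa(
    [
        ["POSITIVE", "NEGATIVE", "POSITIVE"],
        ["POSITIVE", "NEGATIVE", "POSITIVE"],
    ]
) == pytest.approx(1.0, abs=1e-3)

# Three raters: routed to Fleiss' kappa (aggregate_raters + fleiss_kappa).
assert kappa([[1, 2, 3], [1, 1, 3], [1, 3, 3]]) == pytest.approx(0.437, abs=1e-3)

Each inner list holds one rater's labels over the same items, which is why _fleiss_kappa transposes the input before handing it to aggregate_raters.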