add vqa_accuracy and CIDEr #613

Open · wants to merge 1 commit into main
155 changes: 155 additions & 0 deletions metrics/CIDEr/CIDEr.py
@@ -0,0 +1,155 @@
import os
import subprocess
import tempfile
from typing import List

import datasets
import evaluate
from pycocoevalcap.cider.cider import CiderScorer

_DESCRIPTION = """
CIDEr (Consensus-based Image Description Evaluation) evaluates the quality of image captions generated by models for image captioning tasks.
It measures how well a generated caption agrees with a set of human-written reference captions by comparing TF-IDF-weighted n-gram statistics.
The metric is defined as:

$
\\text{CIDEr}_n(c_i, S_i) = \\frac{1}{m} \\sum_{j=1}^{m} \\frac{g^n(c_i) \\cdot g^n(s_{ij})}{\\lVert g^n(c_i) \\rVert \\, \\lVert g^n(s_{ij}) \\rVert}
$

$
\\text{CIDEr}(c_i, S_i) = \\sum_{n=1}^{N} w_n \\, \\text{CIDEr}_n(c_i, S_i)
$

where:
- $ c_i $ is the candidate caption and $ S_i = \\{ s_{i1}, \\dots, s_{im} \\} $ is its set of $ m $ reference captions,
- $ g^n(\\cdot) $ is the vector of TF-IDF weights of all n-grams of length $ n $,
- $ N $ is the maximum n-gram length (typically 4),
- $ w_n $ is the weight for length-$ n $ n-grams (uniform, $ w_n = 1/N $).
"""


_KWARGS_DESCRIPTION = """
Args:
    predictions (`list` of `str`): Predicted captions.
    references (`list` of `list` of `str`): Ground-truth reference captions, one list per prediction.
    n (`int`, defaults to 4): Maximum n-gram length used when computing the score.
    sigma (`float`, defaults to 6.0): Standard deviation of the Gaussian length penalty.

Returns:
    CIDEr (`float`): CIDEr score. The minimum possible value is 0; higher scores indicate closer agreement with the reference captions.

"""


_CITATION = """
@inproceedings{vedantam2015cider,
title={Cider: Consensus-based image description evaluation},
author={Vedantam, Ramakrishna and Lawrence Zitnick, C and Parikh, Devi},
booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
pages={4566--4575},
year={2015}
}
"""

_URLS = {
    "stanford-corenlp": "https://repo1.maven.org/maven2/edu/stanford/nlp/stanford-corenlp/3.4.1/stanford-corenlp-3.4.1.jar"
}


def tokenize(tokenizer_path: str, predictions: List[str], references: List[List[str]]):
    # Punctuation tokens emitted by PTBTokenizer that are dropped before scoring,
    # following the preprocessing used in the COCO caption evaluation toolkit.
    PUNCTUATIONS = [
        "''",
        "'",
        "``",
        "`",
        "-LRB-",
        "-RRB-",
        "-LCB-",
        "-RCB-",
        ".",
        "?",
        "!",
        ",",
        ":",
        "-",
        "--",
        "...",
        ";",
    ]

    cmd = [
        "java",
        "-cp",
        tokenizer_path,
        "edu.stanford.nlp.process.PTBTokenizer",
        "-preserveLines",
        "-lowerCase",
    ]

    # One sentence per line: all predictions first, then all references flattened in order.
    sentences = "\n".join(
        [
            s.replace("\n", " ")
            for s in predictions + [ref for refs in references for ref in refs]
        ]
    )

    # PTBTokenizer reads its input from a file path passed on the command line.
    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(sentences.encode())

    cmd.append(f.name)
    p_tokenizer = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    token_lines = p_tokenizer.communicate()[0].decode()
    lines = [
        " ".join([w for w in line.rstrip().split(" ") if w not in PUNCTUATIONS])
        for line in token_lines.split("\n")
    ]

    os.remove(f.name)

    # Split the tokenized lines back into predictions and per-sample reference lists.
    pred_size = len(predictions)
    ref_sizes = [len(ref) for ref in references]

    predictions = lines[:pred_size]
    start = pred_size
    references = []
    for size in ref_sizes:
        references.append(lines[start : start + size])
        start += size

    return predictions, references


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CIDEr(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Sequence(
                        datasets.Value("string", id="sequence"), id="references"
                    ),
                }
            ),
            reference_urls=[
                "https://github.com/salaniz/pycocoevalcap",
                "https://github.com/tylin/coco-caption",
            ],
        )

    def _download_and_prepare(self, dl_manager):
        # Fetch the Stanford CoreNLP jar that provides the PTBTokenizer.
        self.tokenizer_path = dl_manager.download(_URLS["stanford-corenlp"])

    def _compute(self, predictions, references, n=4, sigma=6.0):
        predictions, references = tokenize(
            self.tokenizer_path, predictions, references
        )
        scorer = CiderScorer(n=n, sigma=sigma)
        for pred, refs in zip(predictions, references):
            scorer += (pred, refs)
        score, _ = scorer.compute_score()
        return {"CIDEr": score}
75 changes: 75 additions & 0 deletions metrics/CIDEr/README.md
@@ -0,0 +1,75 @@
---
title: CIDEr
emoji: 🐨
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
---

# CIDEr Metric for Image Captioning Evaluation

## CIDEr Description
The CIDEr (Consensus-based Image Description Evaluation) metric is widely used in image captioning tasks to evaluate the quality of generated captions. It assesses how well a generated caption aligns with human-written reference captions by comparing TF-IDF-weighted n-gram statistics: n-grams that occur consistently across the reference captions count more, while n-grams that are common across the whole corpus are down-weighted. The final score averages this agreement over n-gram lengths 1 to N.

The CIDEr score of a candidate caption $ c_i $ with reference set $ S_i = \{ s_{i1}, \dots, s_{im} \} $ is defined as follows (a toy sketch of the per-$ n $ similarity follows the definitions):

$
\text{CIDEr}_n(c_i, S_i) = \frac{1}{m} \sum_{j=1}^{m} \frac{g^n(c_i) \cdot g^n(s_{ij})}{\lVert g^n(c_i) \rVert \, \lVert g^n(s_{ij}) \rVert}
$

$
\text{CIDEr}(c_i, S_i) = \sum_{n=1}^{N} w_n \, \text{CIDEr}_n(c_i, S_i)
$

where:
- $ c_i $ is the candidate caption and $ S_i $ is its set of $ m $ reference captions,
- $ g^n(\cdot) $ is the vector of TF-IDF weights of all n-grams of length $ n $,
- $ N $ is the maximum n-gram length (typically 4),
- $ w_n $ is the weight for length-$ n $ n-grams (uniform, $ w_n = 1/N $).
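
To make the TF-IDF cosine similarity concrete, here is a toy, self-contained sketch of the per-$ n $ term $ \text{CIDEr}_n $. It is for illustration only: the helper names (`ngram_counts`, `tfidf_cosine`) are made up for this example, and it omits details of the real `pycocoevalcap` implementation (its exact IDF definition, count clipping, and the Gaussian length penalty).

```python
import math
from collections import Counter


def ngram_counts(text, n):
    # Count the n-grams of length n in a lowercased, whitespace-tokenized caption.
    tokens = text.lower().split()
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))


def tfidf_cosine(candidate, reference, all_reference_sets, n=1):
    # Document frequency: in how many images' reference sets each n-gram appears.
    df = Counter()
    for refs in all_reference_sets:
        df.update(set().union(*(ngram_counts(r, n) for r in refs)))
    num_images = len(all_reference_sets)

    def tfidf(counts):
        return {g: tf * math.log(num_images / df.get(g, 1)) for g, tf in counts.items()}

    c, r = tfidf(ngram_counts(candidate, n)), tfidf(ngram_counts(reference, n))
    dot = sum(c.get(g, 0.0) * w for g, w in r.items())
    norm = math.sqrt(sum(v * v for v in c.values())) * math.sqrt(sum(v * v for v in r.values()))
    return dot / norm if norm else 0.0


# With a single image in the corpus every IDF weight is log(1) = 0, so the similarity
# collapses to 0.0; this is why single-example CIDEr scores are uninformative.
refs = [["a cat is sitting on a mat", "a feline rests on the mat"]]
print(tfidf_cosine("a cat sits on a mat", refs[0][0], refs, n=1))  # 0.0
```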

## How to Use
To use the CIDEr metric, load it with `evaluate.load` and pass the predicted and reference captions to `compute`. The metric tokenizes the captions with the Stanford PTBTokenizer (which requires Java) and then computes the CIDEr score.

### Inputs
- **predictions** *(list of str)*: The list of predicted captions generated by the model.
- **references** *(list of list of str)*: The list of lists, where each list contains the reference captions corresponding to each prediction.
- **n** *(int, optional, defaults to 4)*: Maximum n-gram length used when computing the score.
- **sigma** *(float, optional, defaults to 6.0)*: The standard deviation parameter for the Gaussian penalty.

### Output Values
- **CIDEr** *(float)*: The computed CIDEr score. It is non-negative; higher scores indicate closer agreement between the predicted and reference captions.

### Examples

```python
>>> from evaluate import load
>>> cider = load("Kamichanw/CIDEr")
>>> predictions = ["A cat sits on a mat."]
>>> references = [["A cat is sitting on a mat.", "A feline rests on the mat."]]
>>> score = cider.compute(predictions=predictions, references=references)
>>> print(score['CIDEr'])
0.0
```

## Limitations and Bias
The CIDEr metric primarily focuses on the n-gram overlap between predicted and reference captions. It may not adequately capture semantic nuances or variations in phrasing that still convey the same meaning. Moreover, CIDEr tends to favor longer captions with more word overlap, potentially biasing against concise but accurate captions.

## Citation
If you use the CIDEr metric in your research, please cite the original paper:

```bibtex
@inproceedings{vedantam2015cider,
title={Cider: Consensus-based image description evaluation},
author={Vedantam, Ramakrishna and Lawrence Zitnick, C and Parikh, Devi},
booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
pages={4566--4575},
year={2015}
}
```

## Further References
- [CIDEr GitHub Repository](https://github.com/tylin/coco-caption)
- [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/)
6 changes: 6 additions & 0 deletions metrics/CIDEr/app.py
@@ -0,0 +1,6 @@
import evaluate
from evaluate.utils import launch_gradio_widget


module = evaluate.load("CIDEr")
launch_gradio_widget(module)
2 changes: 2 additions & 0 deletions metrics/CIDEr/requirements.txt
@@ -0,0 +1,2 @@
git+https://github.com/huggingface/evaluate@main
pycocoevalcap
66 changes: 66 additions & 0 deletions metrics/vqa_accuracy/README.md
@@ -0,0 +1,66 @@
---
title: VQA Accuracy
emoji: 🔥
colorFrom: indigo
colorTo: gray
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
---

# VQA Accuracy Metric Card

## Metric Description
The **VQA Accuracy** metric evaluates the answers produced by visual question answering (VQA) models. It is designed to be robust to the variability in how different humans phrase their answers. The accuracy of a predicted answer `ans` is

$
\text{Acc}(ans) = \min\left(\frac{\#\,\text{humans that said } ans}{3},\ 1\right)
$

Following the official VQA evaluation, this quantity is averaged over all leave-one-out subsets of the human annotators (all 10 choose 9 sets in the standard VQA setting with 10 answers per question); a sketch of the rule is given below.
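
As a concrete illustration, here is a minimal, self-contained sketch of this accuracy rule. It is not the module's implementation: the function name `vqa_accuracy_single` is made up for this example, it scores a single prediction against its list of human answers, and it omits the answer normalization (lowercasing, punctuation and article handling) that the official evaluation applies before matching.

```python
from typing import List


def vqa_accuracy_single(prediction: str, human_answers: List[str]) -> float:
    # Average the min(#matches / 3, 1) rule over all leave-one-out subsets of annotators.
    accs = []
    for i in range(len(human_answers)):
        others = human_answers[:i] + human_answers[i + 1:]  # leave annotator i out
        matches = sum(answer == prediction for answer in others)
        accs.append(min(matches / 3.0, 1.0))
    return sum(accs) / len(accs)


# 2 of 10 annotators answered "yes": 100 * (2 * 1/3 + 8 * 2/3) / 10 = 60.0
print(round(100 * vqa_accuracy_single("yes", ["yes"] * 2 + ["no"] * 8), 2))  # 60.0
```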

## How to Use
The **VQA Accuracy** metric evaluates the performance of a VQA model by comparing its predicted answers to sets of ground-truth answers. It can be integrated into your evaluation pipeline as follows:

### Inputs
- **predictions** (`list` of `str`): The predicted answers generated by the VQA model.
- **references** (`list` of `list` of `str`): The ground-truth answers for each question, one list per prediction.
- **answer_types** (`list` of `str`, *optional*): The types of answers corresponding to each question. If not provided, defaults to `None`.
- **question_types** (`list` of `str`, *optional*): The types of questions corresponding to each question. If not provided, defaults to `None`.

### Output Values
The output of this metric is a dictionary containing:
- **overall** (`float`): The overall VQA accuracy.
- **perAnswerType** (`dict`, *optional*): The VQA accuracy for each answer type, if provided.
- **perQuestionType** (`dict`, *optional*): The VQA accuracy for each question type, if provided.

The accuracy values range from 0 to 100, with higher values indicating better performance.

### Examples
Here is an example of how to use the **VQA Accuracy** metric:

```python
>>> from evaluate import load
>>> vqa_accuracy = load("Kamichanw/vqa_accuracy")
>>> predictions = ["yes", "2", "blue"]
>>> references = [["yes", "yeah", "yep"], ["2", "two"], ["blue", "bluish"]]
>>> results = vqa_accuracy.compute(predictions=predictions, references=references)
>>> print(results)
{'overall': 24.07}
```
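
The optional `answer_types` and `question_types` arguments described in the Inputs section add per-category breakdowns to the result. The sketch below shows such a call; the category labels are placeholders chosen for this example, and the returned keys follow the Output Values section above.

```python
results = vqa_accuracy.compute(
    predictions=predictions,
    references=references,
    answer_types=["yes/no", "number", "other"],  # placeholder category labels
)
# In addition to results["overall"], results["perAnswerType"] maps each answer
# type to the accuracy over the questions of that type.
```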

## Limitations and Bias
The **VQA Accuracy** metric depends on the consistency and quality of the ground-truth answers provided. Variability in human annotations can affect the accuracy scores. Additionally, the metric is designed specifically for the VQA task and may not generalize well to other question-answering settings.

## Citation
If you use the **VQA Accuracy** metric in your work, please cite the original VQA paper:

```bibtex
@InProceedings{VQA,
author = {Stanislaw Antol and Aishwarya Agrawal and Jiasen Lu and Margaret Mitchell and Dhruv Batra and C. Lawrence Zitnick and Devi Parikh},
title = {VQA: Visual Question Answering},
booktitle = {International Conference on Computer Vision (ICCV)},
year = {2015},
}
```

## Further References
- [VQA Evaluation](https://visualqa.org/evaluation.html)
- [VQA GitHub Repository](https://github.com/GT-Vision-Lab/VQA)
6 changes: 6 additions & 0 deletions metrics/vqa_accuracy/app.py
@@ -0,0 +1,6 @@
import evaluate
from evaluate.utils import launch_gradio_widget


module = evaluate.load("vqa_accuracy")
launch_gradio_widget(module)
1 change: 1 addition & 0 deletions metrics/vqa_accuracy/requirements.txt
@@ -0,0 +1 @@
git+https://github.com/huggingface/evaluate@main