Fix mturk emoji

stanford-crfm · Aug 4, 2023 · 02d46d3 · 02d46d3
1 parent d52b890
commit 02d46d3
Show file tree

Hide file tree

Showing 4 changed files with 101 additions and 4 deletions.
diff --git a/src/helm/benchmark/metrics/summarization_critique_metrics.py b/src/helm/benchmark/metrics/summarization_critique_metrics.py
@@ -73,6 +73,7 @@ def evaluate_generation(
         if len(request_state.result.completions) != 1:
             raise ValueError("SummarizationCritiqueMetric only supports a single generation per instance")
         summary = request_state.result.completions[0].text
+        summary += "😀"
         request = CritiqueRequest(
             self._template, fields={"original_text": request_state.instance.input.text, "summary": summary}
         )

diff --git a/src/helm/proxy/clients/mechanical_turk_critique_exporter.py b/src/helm/proxy/clients/mechanical_turk_critique_exporter.py
@@ -9,6 +9,7 @@
 from helm.common.critique_request import CritiqueQuestionTemplate, CritiqueRequest, CritiqueTaskTemplate, QuestionType
 from helm.common.general import ensure_directory_exists
 from helm.common.hierarchical_logger import hlog
+from helm.proxy.clients.mechanical_turk_utils import replace_emoji_characters
 
 
 def _indent_to_level(text: str, level: int) -> str:
@@ -45,14 +46,58 @@ def _render_template_crowd_html(task_template: CritiqueTaskTemplate) -> str:
                 return valid;
             }
 
-            window.onload = function() {
+            // Attach the submit-time validation once the DOM has been parsed.
+            document.addEventListener("DOMContentLoaded", function(event) {
                 document.querySelector('crowd-form').onsubmit = function(e) {
                     if (!validateForm()) {
                         alert("Please answer all the questions in order to submit.");
                         e.preventDefault();
                     }
                 }
-            }
+            });
+        </script>
+        <script>
+            /**
+            * utf8ByteArrayToString() copied from:
+            *     https://github.com/google/closure-library/blob/e877b1eac410c0d842bcda118689759512e0e26f/closure/goog/crypt/crypt.js
+            *
+            * Converts a UTF-8 byte array to JavaScript's 16-bit Unicode.
+            * The length of each sequence is chosen from its lead byte alone;
+            * continuation bytes are not validated and truncated input is not detected.
+            * @param {Uint8Array|Array<number>} bytes UTF-8 byte array.
+            * @return {string} 16-bit Unicode string.
+            */
+            function utf8ByteArrayToString(bytes) {
+                // out collects one UTF-16 code unit per entry; pos indexes bytes, c indexes out.
+                var out = [], pos = 0, c = 0;
+                while (pos < bytes.length) {
+                    var c1 = bytes[pos++];
+                    if (c1 < 128) {
+                        // 1-byte sequence (ASCII): the byte is the code unit.
+                        out[c++] = String.fromCharCode(c1);
+                    } else if (c1 > 191 && c1 < 224) {
+                        // 2-byte sequence: 5 payload bits from c1, 6 from c2.
+                        var c2 = bytes[pos++];
+                        out[c++] = String.fromCharCode((c1 & 31) << 6 | c2 & 63);
+                    } else if (c1 > 239 && c1 < 365) {
+                        // 4-byte sequence: 3 + 6 + 6 + 6 payload bits, emitted as two code units.
+                        // NOTE(review): the 365 bound is as in the upstream source; for byte
+                        // values (0-255) that half of the test is always true.
+                        // Surrogate Pair
+                        var c2 = bytes[pos++];
+                        var c3 = bytes[pos++];
+                        var c4 = bytes[pos++];
+                        var u = ((c1 & 7) << 18 | (c2 & 63) << 12 | (c3 & 63) << 6 | c4 & 63) -
+                                        0x10000;
+                        // High surrogate from the top bits of u, low surrogate from the bottom 10.
+                        out[c++] = String.fromCharCode(0xD800 + (u >> 10));
+                        out[c++] = String.fromCharCode(0xDC00 + (u & 1023));
+                    } else {
+                        // Every remaining lead byte (128-191 and 224-239) is decoded as a
+                        // 3-byte sequence: 4 payload bits from c1, 6 each from c2 and c3.
+                        var c2 = bytes[pos++];
+                        var c3 = bytes[pos++];
+                        out[c++] =
+                            String.fromCharCode((c1 & 15) << 12 | (c2 & 63) << 6 | c3 & 63);
+                    }
+                }
+                return out.join('');
+            }
+
+            // Once the DOM is parsed, fill every "emoji-bytes" placeholder span with the
+            // text decoded from the UTF-8 byte array stored in its data-emoji-bytes attribute.
+            document.addEventListener("DOMContentLoaded", function(event) {
+                const placeholders = document.getElementsByClassName("emoji-bytes");
+                for (let i = 0; i < placeholders.length; i++) {
+                    const placeholder = placeholders[i];
+                    const utf8Bytes = JSON.parse(placeholder.getAttribute("data-emoji-bytes"));
+                    placeholder.innerText = utf8ByteArrayToString(utf8Bytes);
+                }
+            });
         </script>"""
     )
 
@@ -195,4 +240,5 @@ def export_request(request: CritiqueRequest):
     with _exporters_lock:
         if template.name not in _exporters:
             _exporters[template.name] = _MechanicalTurkCritiqueRequestExporter(template)
-    _exporters[template.name].export(request.fields)
+    # `request.fields` is a dict: iterate over items() to get (name, value) pairs.
+    # Iterating the dict itself yields only the keys, which cannot be unpacked in two.
+    encoded_fields = {
+        field_name: replace_emoji_characters(field_value) for field_name, field_value in request.fields.items()
+    }
+    _exporters[template.name].export(encoded_fields)
diff --git a/src/helm/proxy/clients/mechanical_turk_critique_importer.py b/src/helm/proxy/clients/mechanical_turk_critique_importer.py
@@ -13,7 +13,7 @@
     CritiqueRequestResult,
 )
 from helm.common.hierarchical_logger import hlog
-
+from helm.proxy.clients.mechanical_turk_utils import replace_emoji_characters
 
 # A representation of fields that can be used as a dict key.
 _CritiqueRequestKey = Tuple[Tuple[str, str], ...]
@@ -119,4 +119,5 @@ def import_request_result(request: CritiqueRequest) -> Optional[CritiqueRequestR
         if template.name not in _importer:
             _importer[template.name] = _MechanicalTurkRequestImporter(template)
             _importer[template.name].initialize()
-    return _importer[template.name].import_request_result(request.fields)
+    # `request.fields` is a dict: iterate over items() to get (name, value) pairs.
+    # Iterating the dict itself yields only the keys, which cannot be unpacked in two.
+    # The values get the same emoji encoding the exporter applies before export.
+    encoded_fields = {
+        field_name: replace_emoji_characters(field_value) for field_name, field_value in request.fields.items()
+    }
+    return _importer[template.name].import_request_result(encoded_fields)
diff --git a/src/helm/proxy/clients/mechanical_turk_utils.py b/src/helm/proxy/clients/mechanical_turk_utils.py
@@ -0,0 +1,49 @@
+import codecs
+import json
+import re
+import sys
+
+
+# Adapted from https://github.com/charman/mturk-emoji/blob/master/encode_emoji.py
+def replace_emoji_characters(s: str) -> str:
+    """Replace 4-byte characters with HTML spans holding their bytes as a JSON array.
+
+    Every character above U+FFFF (e.g. 😀), which takes 4 bytes in UTF-8, is
+    replaced with an empty HTML span carrying those bytes as a JSON array, e.g.:
+
+      <span class='emoji-bytes' data-emoji-bytes='[240, 159, 152, 128]'></span>
+
+    All other characters are left unchanged.
+
+    Args:
+        s: Text that may contain 4-byte characters.
+
+    Returns:
+        `s` with every 4-byte character replaced by an `emoji-bytes` span.
+    """
+
+    def _emoji_match_to_span(emoji_match: "re.Match[str]") -> str:
+        """Render one matched character as a span holding its UTF-8 bytes."""
+        utf8_bytes = emoji_match.group().encode("utf-8")
+        # Iterating over a bytes object yields ints, which is what the JSON array needs.
+        return "<span class='emoji-bytes' data-emoji-bytes='%s'></span>" % json.dumps(list(utf8_bytes))
+
+    # Since Python 3.3 (PEP 393) str always covers the full Unicode range, so the
+    # surrogate-pair pattern needed on narrow (UCS-2) builds is no longer required.
+    return re.sub("[\U00010000-\U0010ffff]", _emoji_match_to_span, s)