Skip to content

Commit

Permalink
Fix mturk emoji
Browse files Browse the repository at this point in the history
  • Loading branch information
yifanmai committed Aug 4, 2023
1 parent d52b890 commit 02d46d3
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def evaluate_generation(
if len(request_state.result.completions) != 1:
raise ValueError("SummarizationCritiqueMetric only supports a single generation per instance")
summary = request_state.result.completions[0].text
summary += "😀"
request = CritiqueRequest(
self._template, fields={"original_text": request_state.instance.input.text, "summary": summary}
)
Expand Down
50 changes: 48 additions & 2 deletions src/helm/proxy/clients/mechanical_turk_critique_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from helm.common.critique_request import CritiqueQuestionTemplate, CritiqueRequest, CritiqueTaskTemplate, QuestionType
from helm.common.general import ensure_directory_exists
from helm.common.hierarchical_logger import hlog
from helm.proxy.clients.mechanical_turk_utils import replace_emoji_characters


def _indent_to_level(text: str, level: int) -> str:
Expand Down Expand Up @@ -45,14 +46,58 @@ def _render_template_crowd_html(task_template: CritiqueTaskTemplate) -> str:
return valid;
}
window.onload = function() {
document.addEventListener("DOMContentLoaded", function(event) {
document.querySelector('crowd-form').onsubmit = function(e) {
if (!validateForm()) {
alert("Please answer all the questions in order to submit.");
e.preventDefault();
}
}
}
</script>
<script>
/**
* utf8ByteArrayToString() copied from:
* https://github.com/google/closure-library/blob/e877b1eac410c0d842bcda118689759512e0e26f/closure/goog/crypt/crypt.js
*
* Converts a UTF-8 byte array to JavaScript's 16-bit Unicode.
* @param {Uint8Array|Array<number>} bytes UTF-8 byte array.
* @return {string} 16-bit Unicode string.
*/
function utf8ByteArrayToString(bytes) {
// out collects the decoded UTF-16 code units, pos is the read index into
// bytes, and c is the write index into out.
var out = [], pos = 0, c = 0;
while (pos < bytes.length) {
// The lead byte decides how many continuation bytes follow.
var c1 = bytes[pos++];
if (c1 < 128) {
// Lead byte below 128: single-byte character, used as the code unit as-is.
out[c++] = String.fromCharCode(c1);
} else if (c1 > 191 && c1 < 224) {
// Lead byte in 192..223: two-byte sequence (5 bits from c1, 6 bits from c2).
var c2 = bytes[pos++];
out[c++] = String.fromCharCode((c1 & 31) << 6 | c2 & 63);
} else if (c1 > 239 && c1 < 365) {
// Lead byte above 239: four-byte sequence, emitted as a surrogate pair.
// NOTE(review): the upper bound 365 is larger than any byte value (255), so
// for byte input this test is effectively "c1 > 239"; presumably inherited
// from the upstream source -- confirm before changing.
var c2 = bytes[pos++];
var c3 = bytes[pos++];
var c4 = bytes[pos++];
// Assemble the code point from 3 + 6 + 6 + 6 bits, then subtract 0x10000
// so the remainder can be split across the two surrogate halves.
var u = ((c1 & 7) << 18 | (c2 & 63) << 12 | (c3 & 63) << 6 | c4 & 63) -
0x10000;
// High surrogate carries the bits above the low 10; low surrogate carries the low 10 bits.
out[c++] = String.fromCharCode(0xD800 + (u >> 10));
out[c++] = String.fromCharCode(0xDC00 + (u & 1023));
} else {
// Every remaining lead byte is decoded as a three-byte sequence
// (4 bits from c1, 6 bits each from c2 and c3).
var c2 = bytes[pos++];
var c3 = bytes[pos++];
out[c++] =
String.fromCharCode((c1 & 15) << 12 | (c2 & 63) << 6 | c3 & 63);
}
}
// Concatenate the collected code units into the final string.
return out.join('');
}
document.addEventListener("DOMContentLoaded", function(event) {
const emojiSpans = document.getElementsByClassName("emoji-bytes");
for (let emojiSpan of emojiSpans) {
emojiSpan.innerText = utf8ByteArrayToString(JSON.parse(emojiSpan.getAttribute("data-emoji-bytes")));
}
});
</script>"""
)

Expand Down Expand Up @@ -195,4 +240,5 @@ def export_request(request: CritiqueRequest):
with _exporters_lock:
if template.name not in _exporters:
_exporters[template.name] = _MechanicalTurkCritiqueRequestExporter(template)
_exporters[template.name].export(request.fields)
# Iterate over (name, value) pairs: looping over the fields dict directly
# yields only its keys, so unpacking into two names would raise ValueError.
encoded_fields = {
field_name: replace_emoji_characters(field_value) for field_name, field_value in request.fields.items()
}
_exporters[template.name].export(encoded_fields)
5 changes: 3 additions & 2 deletions src/helm/proxy/clients/mechanical_turk_critique_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
CritiqueRequestResult,
)
from helm.common.hierarchical_logger import hlog

from helm.proxy.clients.mechanical_turk_utils import replace_emoji_characters

# A representation of fields that can be used as a dict key.
_CritiqueRequestKey = Tuple[Tuple[str, str], ...]
Expand Down Expand Up @@ -119,4 +119,5 @@ def import_request_result(request: CritiqueRequest) -> Optional[CritiqueRequestR
if template.name not in _importer:
_importer[template.name] = _MechanicalTurkRequestImporter(template)
_importer[template.name].initialize()
return _importer[template.name].import_request_result(request.fields)
encoded_fields = {field_name: replace_emoji_characters(field_value) for field_name, field_value in request.fields}
return _importer[template.name].import_request_result(encoded_fields)
49 changes: 49 additions & 0 deletions src/helm/proxy/clients/mechanical_turk_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import codecs
import json
import re
import sys


# Adapted from https://github.com/charman/mturk-emoji/blob/master/encode_emoji.py
def replace_emoji_characters(s: str) -> str:
    """Replace each 4-byte UTF-8 character in ``s`` with an HTML span.

    Every character outside the Basic Multilingual Plane (e.g. 😀) is
    replaced by an empty span whose ``data-emoji-bytes`` attribute holds the
    character's UTF-8 bytes as a JSON array, e.g.:

        <span class='emoji-bytes' data-emoji-bytes='[240, 159, 152, 128]'></span>

    All other characters are passed through unchanged.

    Args:
        s: The text to encode.

    Returns:
        ``s`` with every 4-byte character replaced by an ``emoji-bytes`` span.

    Raises:
        UnicodeError: If ``sys.maxunicode`` matches neither a UCS-4 nor a
            UCS-2 build of Python.
    """

    def _emoji_match_to_span(emoji_match: "re.Match[str]") -> str:
        """Render one matched character as an ``emoji-bytes`` span."""
        utf8_bytes = codecs.encode(emoji_match.group(), "utf-8")
        # Iterating over bytes yields ints, so list() gives the JSON-ready array.
        bytes_as_json = json.dumps(list(utf8_bytes))
        return f"<span class='emoji-bytes' data-emoji-bytes='{bytes_as_json}'></span>"

    # The procedure for finding emoji characters is based on this
    # StackOverflow post:
    # http://stackoverflow.com/questions/12636489/python-convert-4-byte-char-to-avoid-mysql-error-incorrect-string-value
    if sys.maxunicode == 0x10FFFF:
        # Wide (UCS-4) build: each astral character is a single code point.
        highpoints = re.compile("[\U00010000-\U0010ffff]")
    elif sys.maxunicode == 0xFFFF:
        # Narrow (UCS-2) build: astral characters appear as surrogate pairs.
        highpoints = re.compile("[\uD800-\uDBFF][\uDC00-\uDFFF]")
    else:
        raise UnicodeError("Unable to determine if Python was built using UCS-2 or UCS-4")

    return highpoints.sub(_emoji_match_to_span, s)

0 comments on commit 02d46d3

Please sign in to comment.