Skip to content

Commit

Permalink
Use HTML entities instead
Browse files Browse the repository at this point in the history
  • Loading branch information
yifanmai committed Aug 11, 2023
1 parent b19d582 commit 4601b35
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 54 deletions.
52 changes: 2 additions & 50 deletions src/helm/proxy/clients/mechanical_turk_critique_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def _format_template_tags(raw_text: str) -> str:

def _render_template_crowd_html(task_template: CritiqueTaskTemplate) -> str:
"""Render the Crowd HTML for the template."""
scripts_crowd_html = textwrap.dedent(
validation_crowd_html = textwrap.dedent(
"""\
<script>
// Validates that an option is selected for each radio group
Expand Down Expand Up @@ -54,54 +54,6 @@ def _render_template_crowd_html(task_template: CritiqueTaskTemplate) -> str:
}
}
});
</script>
<script>
// Displays escaped emoji because Mechanical Turk does not support
// unescaped emoji ("Unsupported characters found").
// Source: https://github.com/charman/mturk-emoji
/**
* utf8ByteArrayToString() copied from:
* https://github.com/google/closure-library/blob/e877b1eac410c0d842bcda118689759512e0e26f/closure/goog/crypt/crypt.js
*
* Converts a UTF-8 byte array to JavaScript's 16-bit Unicode.
* @param {Uint8Array|Array<number>} bytes UTF-8 byte array.
* @return {string} 16-bit Unicode string.
*/
function utf8ByteArrayToString(bytes) {
// out: decoded UTF-16 code units (one string per unit); pos: read index
// into `bytes`; c: write index into `out`.
var out = [], pos = 0, c = 0;
while (pos < bytes.length) {
// The lead byte alone selects the sequence length; continuation bytes are
// consumed without being validated.
var c1 = bytes[pos++];
if (c1 < 128) {
// Lead byte below 128: one-byte sequence, emitted unchanged.
out[c++] = String.fromCharCode(c1);
} else if (c1 > 191 && c1 < 224) {
// Lead byte 192..223: two-byte sequence (low 5 bits of c1, low 6 bits of c2).
var c2 = bytes[pos++];
out[c++] = String.fromCharCode((c1 & 31) << 6 | c2 & 63);
} else if (c1 > 239 && c1 < 365) {
// Lead byte above 239: four-byte sequence, i.e. a code point above 0xFFFF
// that must be written as a UTF-16 surrogate pair.
// NOTE(review): a byte value cannot exceed 255, so the `< 365` bound looks
// unreachable for byte input and the test behaves like `c1 > 239`; confirm
// against the upstream Closure Library source before changing it.
var c2 = bytes[pos++];
var c3 = bytes[pos++];
var c4 = bytes[pos++];
// Assemble the code point from 3 + 6 + 6 + 6 bits, then subtract 0x10000 to
// get the 20-bit offset that is split across the two surrogates.
var u = ((c1 & 7) << 18 | (c2 & 63) << 12 | (c3 & 63) << 6 | c4 & 63) -
0x10000;
// High surrogate carries the top 10 bits, low surrogate the bottom 10 bits.
out[c++] = String.fromCharCode(0xD800 + (u >> 10));
out[c++] = String.fromCharCode(0xDC00 + (u & 1023));
} else {
// Every remaining lead byte (128..191 and 224..239) is decoded as a
// three-byte sequence: low 4 bits of c1, low 6 bits each of c2 and c3.
var c2 = bytes[pos++];
var c3 = bytes[pos++];
out[c++] =
String.fromCharCode((c1 & 15) << 12 | (c2 & 63) << 6 | c3 & 63);
}
}
// Concatenate the collected code units into the final string.
return out.join('');
}
// Once the DOM has been parsed, fill every element with class "emoji-bytes"
// with the text decoded from its "data-emoji-bytes" attribute.
document.addEventListener("DOMContentLoaded", function(event) {
const emojiSpans = document.getElementsByClassName("emoji-bytes");
for (let emojiSpan of emojiSpans) {
// The attribute is parsed as JSON and passed to utf8ByteArrayToString(), so it
// is expected to hold a JSON array of UTF-8 byte values.
emojiSpan.innerText = utf8ByteArrayToString(JSON.parse(emojiSpan.getAttribute("data-emoji-bytes")));
}
});
</script>"""
)

Expand All @@ -115,7 +67,7 @@ def _render_template_crowd_html(task_template: CritiqueTaskTemplate) -> str:
return textwrap.dedent(
f"""\
<script src="https://assets.crowd.aws/crowd-html-elements.js"></script>
{_indent_to_level(scripts_crowd_html, 2)}
{_indent_to_level(validation_crowd_html, 2)}
<crowd-form answer-format="flatten-objects">
{_indent_to_level(instructions_crowd_html, 3)}
{_indent_to_level(divider_html, 3)}
Expand Down
5 changes: 1 addition & 4 deletions src/helm/proxy/clients/mechanical_turk_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import codecs
import json
import re
import sys
Expand Down Expand Up @@ -29,9 +28,7 @@ def _emoji_match_to_span(emoji_match):
Returns:
Unicode string
"""
bytes = codecs.encode(emoji_match.group(), "utf-8")
bytes_as_json = json.dumps([b for b in bytearray(bytes)])
return "<span class='emoji-bytes' data-emoji-bytes='%s'></span>" % bytes_as_json
return emoji_match.group().encode("ascii", "xmlcharrefreplace").decode()

# The procedure for stripping Emoji characters is based on this
# StackOverflow post:
Expand Down

0 comments on commit 4601b35

Please sign in to comment.