Skip to content

Commit

Permalink
Support emoji for MTurk import / export (#1773)
Browse files Browse the repository at this point in the history
  • Loading branch information
yifanmai authored Aug 11, 2023
1 parent f5ef0fc commit 46547d7
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def __init__(self, num_respondents: int) -> None:
CritiqueQuestionTemplate(
name=_RELEVANCE_NAME,
question_type=QuestionType.MULTIPLE_CHOICE,
text="To what extend the summary include only important information from the source document? "
text="To what extent the summary include only important information from the source document? "
"(1 = not at all, 5 = very much)",
options=["1", "2", "3", "4", "5"],
),
Expand Down
33 changes: 20 additions & 13 deletions src/helm/proxy/clients/mechanical_turk_critique_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from helm.common.critique_request import CritiqueQuestionTemplate, CritiqueRequest, CritiqueTaskTemplate, QuestionType
from helm.common.general import ensure_directory_exists
from helm.common.hierarchical_logger import hlog
from helm.proxy.clients.mechanical_turk_utils import replace_emoji_characters


def _indent_to_level(text: str, level: int) -> str:
Expand Down Expand Up @@ -45,20 +46,22 @@ def _render_template_crowd_html(task_template: CritiqueTaskTemplate) -> str:
return valid;
}
window.onload = function() {
document.addEventListener("DOMContentLoaded", function(event) {
document.querySelector('crowd-form').onsubmit = function(e) {
if (!validateForm()) {
alert("Please answer all the questions in order to submit.");
e.preventDefault();
}
}
}
});
</script>"""
)

instructions_crowd_html = f"<div>{_format_template_tags(task_template.instructions)}</div>"
instruction_question_break_html = "<br><br><h4>Please answer the questions below:</h4>"
questions_crowd_html = "<br>\n<br>\n".join(
instructions_crowd_html = (
f'<p style="white-space: pre-wrap;">{_format_template_tags(task_template.instructions)}</p>'
)
divider_html = "\n<hr>"
questions_crowd_html = "\n<hr>\n".join(
[_render_question_crowd_html(question) for question in task_template.questions]
)
return textwrap.dedent(
Expand All @@ -67,8 +70,9 @@ def _render_template_crowd_html(task_template: CritiqueTaskTemplate) -> str:
{_indent_to_level(validation_crowd_html, 2)}
<crowd-form answer-format="flatten-objects">
{_indent_to_level(instructions_crowd_html, 3)}
{_indent_to_level(instruction_question_break_html, 3)}
{_indent_to_level(divider_html, 3)}
{_indent_to_level(questions_crowd_html, 3)}
{_indent_to_level(divider_html, 3)}
</crowd-form>"""
)

Expand All @@ -91,16 +95,16 @@ def _render_question_crowd_html(question_template: CritiqueQuestionTemplate) ->
)
return textwrap.dedent(
f"""\
<div>
<p>{_format_template_tags(question_template.text)}</p>
{_indent_to_level(question_input_crowd_html, 3)}
</div>"""
<p style=\"white-space: pre-wrap;\">
{_format_template_tags(question_template.text)}
</p>
{_indent_to_level(question_input_crowd_html, 2)}"""
)


def _render_multiple_choice_options_crowd_html(name: str, options: List[str]) -> str:
"""Render the Crowd HTML for the options of a multiple-choice question."""
buttons_crowd_html = "<br>\n".join(
buttons_crowd_html = "\n<br>\n".join(
[
f"""<crowd-radio-button name="{name}.{index}">{_format_template_tags(option)}</crowd-radio-button>"""
for index, option in enumerate(options)
Expand All @@ -116,7 +120,7 @@ def _render_multiple_choice_options_crowd_html(name: str, options: List[str]) ->

def _render_checkbox_options_crowd_html(name: str, options: List[str]) -> str:
"""Render the Crowd HTML for the options of a checkbox question."""
return "<br>\n".join(
return "\n<br>\n".join(
[
f"""<crowd-checkbox name="{name}.{index}">{_format_template_tags(option)}</crowd-checkbox>"""
for index, option in enumerate(options)
Expand Down Expand Up @@ -195,4 +199,7 @@ def export_request(request: CritiqueRequest):
with _exporters_lock:
if template.name not in _exporters:
_exporters[template.name] = _MechanicalTurkCritiqueRequestExporter(template)
_exporters[template.name].export(request.fields)
encoded_fields = {
field_name: replace_emoji_characters(field_value) for field_name, field_value in request.fields.items()
}
_exporters[template.name].export(encoded_fields)
7 changes: 5 additions & 2 deletions src/helm/proxy/clients/mechanical_turk_critique_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
CritiqueRequestResult,
)
from helm.common.hierarchical_logger import hlog

from helm.proxy.clients.mechanical_turk_utils import replace_emoji_characters

# A representation of fields that can be used as a dict key.
_CritiqueRequestKey = Tuple[Tuple[str, str], ...]
Expand Down Expand Up @@ -119,4 +119,7 @@ def import_request_result(request: CritiqueRequest) -> Optional[CritiqueRequestR
if template.name not in _importer:
_importer[template.name] = _MechanicalTurkRequestImporter(template)
_importer[template.name].initialize()
return _importer[template.name].import_request_result(request.fields)
encoded_fields = {
field_name: replace_emoji_characters(field_value) for field_name, field_value in request.fields.items()
}
return _importer[template.name].import_request_result(encoded_fields)
45 changes: 45 additions & 0 deletions src/helm/proxy/clients/mechanical_turk_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import re
from re import Match
import sys


# Matches any single code point outside the Basic Multilingual Plane
# (U+10000..U+10FFFF), i.e. every character that needs 4 bytes in UTF-8.
# Emoji such as U+1F600 live in this range.
#
# The approach is based on this StackOverflow post:
# http://stackoverflow.com/questions/12636489/python-convert-4-byte-char-to-avoid-mysql-error-incorrect-string-value
#
# The original recipe also carried a surrogate-pair pattern for narrow (UCS-2)
# builds, selected via sys.maxunicode. Since Python 3.3, sys.maxunicode is
# always 0x10FFFF, so that branch could never run and has been dropped.
_SUPPLEMENTARY_PLANE_CHARACTER = re.compile("[\U00010000-\U0010ffff]")


# Source: https://github.com/charman/mturk-emoji
def replace_emoji_characters(s: str) -> str:
    """Replace every 4-byte Unicode character in a string with an HTML entity.

    Each character outside the Basic Multilingual Plane (for example an
    emoji) is replaced by its decimal numeric character reference. All other
    characters, including non-ASCII characters inside the Basic Multilingual
    Plane, are left untouched.

    Example: "hi😀😀" becomes "hi&#128512;&#128512;".

    Args:
        s: String that may contain 4-byte characters, e.g. "hi😀😀".

    Returns:
        A copy of ``s`` in which every 4-byte character has been replaced
        with its HTML entity, e.g. "hi&#128512;&#128512;".
    """

    def _emoji_match_to_entity(emoji_match: Match) -> str:
        """Return the matched character encoded as an HTML entity.

        Args:
            emoji_match: Match whose whole text is a single 4-byte
                character, e.g. "😀".

        Returns:
            The decimal numeric character reference, e.g. "&#128512;".
        """
        # The matched character is never ASCII, so the "xmlcharrefreplace"
        # error handler always fires and emits "&#<code point>;".
        return emoji_match.group().encode("ascii", "xmlcharrefreplace").decode()

    return _SUPPLEMENTARY_PLANE_CHARACTER.sub(_emoji_match_to_entity, s)

0 comments on commit 46547d7

Please sign in to comment.