Skip to content

Commit

Permalink
Support emoji in Mechanical Turk
Browse files Browse the repository at this point in the history
  • Loading branch information
yifanmai committed Aug 7, 2023
1 parent 02d46d3 commit 4c7a7d5
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def __init__(self, num_respondents: int) -> None:
CritiqueQuestionTemplate(
name=_RELEVANCE_NAME,
question_type=QuestionType.MULTIPLE_CHOICE,
text="To what extend the summary include only important information from the source document? "
text="To what extent the summary include only important information from the source document? "
"(1 = not at all, 5 = very much)",
options=["1", "2", "3", "4", "5"],
),
Expand Down
37 changes: 23 additions & 14 deletions src/helm/proxy/clients/mechanical_turk_critique_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def _format_template_tags(raw_text: str) -> str:

def _render_template_crowd_html(task_template: CritiqueTaskTemplate) -> str:
"""Render the Crowd HTML for the template."""
validation_crowd_html = textwrap.dedent(
scripts_crowd_html = textwrap.dedent(
"""\
<script>
// Validates that an option is selected for each radio group
Expand All @@ -53,9 +53,13 @@ def _render_template_crowd_html(task_template: CritiqueTaskTemplate) -> str:
e.preventDefault();
}
}
}
});
</script>
<script>
// Displays escaped emoji, because Mechanical Turk does not support
// unescaped emoji. ("Unsupported characters found")
// Source: https://github.com/charman/mturk-emoji
/**
* utf8ByteArrayToString() copied from:
* https://github.com/google/closure-library/blob/e877b1eac410c0d842bcda118689759512e0e26f/closure/goog/crypt/crypt.js
Expand Down Expand Up @@ -101,19 +105,22 @@ def _render_template_crowd_html(task_template: CritiqueTaskTemplate) -> str:
</script>"""
)

instructions_crowd_html = f"<div>{_format_template_tags(task_template.instructions)}</div>"
instruction_question_break_html = "<br><br><h4>Please answer the questions below:</h4>"
questions_crowd_html = "<br>\n<br>\n".join(
instructions_crowd_html = (
f'<p style="white-space: pre-wrap;">{_format_template_tags(task_template.instructions)}</p>'
)
divider_html = "\n<hr>"
questions_crowd_html = "\n<hr>\n".join(
[_render_question_crowd_html(question) for question in task_template.questions]
)
return textwrap.dedent(
f"""\
<script src="https://assets.crowd.aws/crowd-html-elements.js"></script>
{_indent_to_level(validation_crowd_html, 2)}
{_indent_to_level(scripts_crowd_html, 2)}
<crowd-form answer-format="flatten-objects">
{_indent_to_level(instructions_crowd_html, 3)}
{_indent_to_level(instruction_question_break_html, 3)}
{_indent_to_level(divider_html, 3)}
{_indent_to_level(questions_crowd_html, 3)}
{_indent_to_level(divider_html, 3)}
</crowd-form>"""
)

Expand All @@ -136,16 +143,16 @@ def _render_question_crowd_html(question_template: CritiqueQuestionTemplate) ->
)
return textwrap.dedent(
f"""\
<div>
<p>{_format_template_tags(question_template.text)}</p>
{_indent_to_level(question_input_crowd_html, 3)}
</div>"""
<p style=\"white-space: pre-wrap;\">
{_format_template_tags(question_template.text)}
</p>
{_indent_to_level(question_input_crowd_html, 2)}"""
)


def _render_multiple_choice_options_crowd_html(name: str, options: List[str]) -> str:
"""Render the Crowd HTML for the options of a multiple-choice question."""
buttons_crowd_html = "<br>\n".join(
buttons_crowd_html = "\n<br>\n".join(
[
f"""<crowd-radio-button name="{name}.{index}">{_format_template_tags(option)}</crowd-radio-button>"""
for index, option in enumerate(options)
Expand All @@ -161,7 +168,7 @@ def _render_multiple_choice_options_crowd_html(name: str, options: List[str]) ->

def _render_checkbox_options_crowd_html(name: str, options: List[str]) -> str:
"""Render the Crowd HTML for the options of a checkbox question."""
return "<br>\n".join(
return "\n<br>\n".join(
[
f"""<crowd-checkbox name="{name}.{index}">{_format_template_tags(option)}</crowd-checkbox>"""
for index, option in enumerate(options)
Expand Down Expand Up @@ -240,5 +247,7 @@ def export_request(request: CritiqueRequest):
with _exporters_lock:
if template.name not in _exporters:
_exporters[template.name] = _MechanicalTurkCritiqueRequestExporter(template)
encoded_fields = {field_name: replace_emoji_characters(field_value) for field_name, field_value in request.fields}
encoded_fields = {
field_name: replace_emoji_characters(field_value) for field_name, field_value in request.fields.items()
}
_exporters[template.name].export(encoded_fields)
4 changes: 3 additions & 1 deletion src/helm/proxy/clients/mechanical_turk_critique_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,5 +119,7 @@ def import_request_result(request: CritiqueRequest) -> Optional[CritiqueRequestR
if template.name not in _importer:
_importer[template.name] = _MechanicalTurkRequestImporter(template)
_importer[template.name].initialize()
encoded_fields = {field_name: replace_emoji_characters(field_value) for field_name, field_value in request.fields}
encoded_fields = {
field_name: replace_emoji_characters(field_value) for field_name, field_value in request.fields.items()
}
return _importer[template.name].import_request_result(encoded_fields)
15 changes: 7 additions & 8 deletions src/helm/proxy/clients/mechanical_turk_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import sys


# Adapted from https://github.com/charman/mturk-emoji/blob/master/encode_emoji.py
# Source: https://github.com/charman/mturk-emoji
def replace_emoji_characters(s):
"""Replace 4-byte characters with HTML spans with bytes as JSON array
Expand All @@ -20,6 +20,7 @@ def replace_emoji_characters(s):
Unicode string with all 4-byte Unicode characters in the source
string replaced with HTML spans
"""

def _emoji_match_to_span(emoji_match):
"""
Args:
Expand All @@ -28,22 +29,20 @@ def _emoji_match_to_span(emoji_match):
Returns:
Unicode string
"""
bytes = codecs.encode(emoji_match.group(), 'utf-8')
bytes = codecs.encode(emoji_match.group(), "utf-8")
bytes_as_json = json.dumps([b for b in bytearray(bytes)])
return u"<span class='emoji-bytes' data-emoji-bytes='%s'></span>" % \
bytes_as_json
return "<span class='emoji-bytes' data-emoji-bytes='%s'></span>" % bytes_as_json

# The procedure for stripping Emoji characters is based on this
# StackOverflow post:
# http://stackoverflow.com/questions/12636489/python-convert-4-byte-char-to-avoid-mysql-error-incorrect-string-value
if sys.maxunicode == 1114111:
# Python was built with '--enable-unicode=ucs4'
highpoints = re.compile(u'[\U00010000-\U0010ffff]')
highpoints = re.compile("[\U00010000-\U0010ffff]")
elif sys.maxunicode == 65535:
# Python was built with '--enable-unicode=ucs2'
highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
highpoints = re.compile("[\uD800-\uDBFF][\uDC00-\uDFFF]")
else:
raise UnicodeError(
"Unable to determine if Python was built using UCS-2 or UCS-4")
raise UnicodeError("Unable to determine if Python was built using UCS-2 or UCS-4")

return highpoints.sub(_emoji_match_to_span, s)

0 comments on commit 4c7a7d5

Please sign in to comment.