Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support emoji for MTurk import / export #1773

Merged
merged 8 commits into from
Aug 11, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def __init__(self, num_respondents: int) -> None:
CritiqueQuestionTemplate(
name=_RELEVANCE_NAME,
question_type=QuestionType.MULTIPLE_CHOICE,
text="To what extend the summary include only important information from the source document? "
text="To what extent the summary include only important information from the source document? "
"(1 = not at all, 5 = very much)",
options=["1", "2", "3", "4", "5"],
),
Expand Down
33 changes: 20 additions & 13 deletions src/helm/proxy/clients/mechanical_turk_critique_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from helm.common.critique_request import CritiqueQuestionTemplate, CritiqueRequest, CritiqueTaskTemplate, QuestionType
from helm.common.general import ensure_directory_exists
from helm.common.hierarchical_logger import hlog
from helm.proxy.clients.mechanical_turk_utils import replace_emoji_characters


def _indent_to_level(text: str, level: int) -> str:
Expand Down Expand Up @@ -45,20 +46,22 @@ def _render_template_crowd_html(task_template: CritiqueTaskTemplate) -> str:
return valid;
}

window.onload = function() {
document.addEventListener("DOMContentLoaded", function(event) {
document.querySelector('crowd-form').onsubmit = function(e) {
if (!validateForm()) {
alert("Please answer all the questions in order to submit.");
e.preventDefault();
}
}
}
});
</script>"""
)

instructions_crowd_html = f"<div>{_format_template_tags(task_template.instructions)}</div>"
instruction_question_break_html = "<br><br><h4>Please answer the questions below:</h4>"
questions_crowd_html = "<br>\n<br>\n".join(
instructions_crowd_html = (
f'<p style="white-space: pre-wrap;">{_format_template_tags(task_template.instructions)}</p>'
)
divider_html = "\n<hr>"
questions_crowd_html = "\n<hr>\n".join(
[_render_question_crowd_html(question) for question in task_template.questions]
)
return textwrap.dedent(
Expand All @@ -67,8 +70,9 @@ def _render_template_crowd_html(task_template: CritiqueTaskTemplate) -> str:
{_indent_to_level(validation_crowd_html, 2)}
<crowd-form answer-format="flatten-objects">
{_indent_to_level(instructions_crowd_html, 3)}
{_indent_to_level(instruction_question_break_html, 3)}
{_indent_to_level(divider_html, 3)}
{_indent_to_level(questions_crowd_html, 3)}
{_indent_to_level(divider_html, 3)}
</crowd-form>"""
)

Expand All @@ -91,16 +95,16 @@ def _render_question_crowd_html(question_template: CritiqueQuestionTemplate) ->
)
return textwrap.dedent(
f"""\
<div>
<p>{_format_template_tags(question_template.text)}</p>
{_indent_to_level(question_input_crowd_html, 3)}
</div>"""
<p style=\"white-space: pre-wrap;\">
{_format_template_tags(question_template.text)}
</p>
{_indent_to_level(question_input_crowd_html, 2)}"""
)


def _render_multiple_choice_options_crowd_html(name: str, options: List[str]) -> str:
"""Render the Crowd HTML for the options of a multiple-choice question."""
buttons_crowd_html = "<br>\n".join(
buttons_crowd_html = "\n<br>\n".join(
[
f"""<crowd-radio-button name="{name}.{index}">{_format_template_tags(option)}</crowd-radio-button>"""
for index, option in enumerate(options)
Expand All @@ -116,7 +120,7 @@ def _render_multiple_choice_options_crowd_html(name: str, options: List[str]) ->

def _render_checkbox_options_crowd_html(name: str, options: List[str]) -> str:
"""Render the Crowd HTML for the options of a checkbox question."""
return "<br>\n".join(
return "\n<br>\n".join(
[
f"""<crowd-checkbox name="{name}.{index}">{_format_template_tags(option)}</crowd-checkbox>"""
for index, option in enumerate(options)
Expand Down Expand Up @@ -195,4 +199,7 @@ def export_request(request: CritiqueRequest):
with _exporters_lock:
if template.name not in _exporters:
_exporters[template.name] = _MechanicalTurkCritiqueRequestExporter(template)
_exporters[template.name].export(request.fields)
encoded_fields = {
field_name: replace_emoji_characters(field_value) for field_name, field_value in request.fields.items()
}
_exporters[template.name].export(encoded_fields)
7 changes: 5 additions & 2 deletions src/helm/proxy/clients/mechanical_turk_critique_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
CritiqueRequestResult,
)
from helm.common.hierarchical_logger import hlog

from helm.proxy.clients.mechanical_turk_utils import replace_emoji_characters

# A representation of fields that can be used as a dict key.
_CritiqueRequestKey = Tuple[Tuple[str, str], ...]
Expand Down Expand Up @@ -119,4 +119,7 @@ def import_request_result(request: CritiqueRequest) -> Optional[CritiqueRequestR
if template.name not in _importer:
_importer[template.name] = _MechanicalTurkRequestImporter(template)
_importer[template.name].initialize()
return _importer[template.name].import_request_result(request.fields)
encoded_fields = {
field_name: replace_emoji_characters(field_value) for field_name, field_value in request.fields.items()
}
return _importer[template.name].import_request_result(encoded_fields)
45 changes: 45 additions & 0 deletions src/helm/proxy/clients/mechanical_turk_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import json
import re
import sys


# Source: https://github.com/charman/mturk-emoji
def replace_emoji_characters(s):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add type hints

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added.

"""Replace 4-byte characters with HTML numeric character references

This function takes a Unicode string containing 4-byte Unicode
characters, e.g. 😀, and replaces each 4-byte character with its
decimal HTML/XML numeric character reference, e.g.:

&#128512;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use double quotes?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is out of date, right?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated.


Args:
s (Unicode string):
Returns:
Unicode string with all 4-byte Unicode characters in the source
string replaced with HTML numeric character references
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Give an example?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added example.

"""

def _emoji_match_to_span(emoji_match):
"""Encode the text of one regex match as ASCII character references.

Despite the name, no <span> element is built here: the matched text is
encoded to ASCII with the "xmlcharrefreplace" error handler, so a
character that ASCII cannot represent is written as a decimal numeric
reference (e.g. 😀 becomes &#128512;), and the bytes are decoded back
to a string.

Args:
emoji_match (MatchObject): match whose full text (group()) is encoded

Returns:
Unicode string
"""
return emoji_match.group().encode("ascii", "xmlcharrefreplace").decode()

# The procedure for stripping Emoji characters is based on this
# StackOverflow post:
# http://stackoverflow.com/questions/12636489/python-convert-4-byte-char-to-avoid-mysql-error-incorrect-string-value
if sys.maxunicode == 1114111:
# Python was built with '--enable-unicode=ucs4'
highpoints = re.compile("[\U00010000-\U0010ffff]")
elif sys.maxunicode == 65535:
# Python was built with '--enable-unicode=ucs2'
highpoints = re.compile("[\uD800-\uDBFF][\uDC00-\uDFFF]")
else:
raise UnicodeError("Unable to determine if Python was built using UCS-2 or UCS-4")

return highpoints.sub(_emoji_match_to_span, s)
Loading