-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Contextualized bias mitigation #5176
Changes from 47 commits
79c6c33
e23057c
fcc3d34
7d00910
668a513
91029ef
396b245
a8c22a1
ef6a062
2c873cb
d97a526
8460281
5a76922
85cb107
8e55f28
b42b73a
37d8e33
31b1d2c
a1f4f2a
36cebe3
88c083b
86081ee
ae592d8
2501b8c
f664dfb
595449d
dc4793f
0cdcf89
f254128
1be00c8
a6c9bf6
6624680
90a372e
c6a2dbf
7797659
bbfddd7
f2f3fc3
4e79de7
5dae69f
254676f
26d8dff
1ae5e99
e2cc38e
c34cf31
fdb9ea7
3efffd2
c47de58
2b8cf09
33d6267
ec53a05
4afb7f2
21bed9d
fefcbad
972ea60
b4011cb
4d7fffb
22a5964
bd727dd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,3 +19,4 @@ | |
from allennlp.data.dataset_readers.sequence_tagging import SequenceTaggingDatasetReader | ||
from allennlp.data.dataset_readers.sharded_dataset_reader import ShardedDatasetReader | ||
from allennlp.data.dataset_readers.text_classification_json import TextClassificationJsonReader | ||
from allennlp.data.dataset_readers.snli import SnliReader | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is present in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, it's used in the test, but the environment in which the test is run doesn't have |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
from typing import Dict, Optional | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Was this just copied over from There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, it's used in the test, but the environment in which the test is run doesn't have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's fine to have some tests in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added the test under There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can remove this file now, right? |
||
import json | ||
import logging | ||
|
||
from overrides import overrides | ||
|
||
from allennlp.common.file_utils import cached_path | ||
from allennlp.data.dataset_readers.dataset_reader import DatasetReader | ||
from allennlp.data.fields import Field, TextField, LabelField, MetadataField | ||
from allennlp.data.instance import Instance | ||
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer | ||
from allennlp.data.tokenizers import Tokenizer, SpacyTokenizer, PretrainedTransformerTokenizer | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def maybe_collapse_label(label: str, collapse: bool) -> str:
    """
    Helper function that optionally collapses the "contradiction" and "neutral" labels
    into "non-entailment", leaving "entailment" unchanged.

    # Parameters

    label : `str`
        One of the SNLI gold labels: "contradiction", "neutral", or "entailment".
    collapse : `bool`
        If `True`, map "contradiction" and "neutral" to "non-entailment".

    # Raises

    `ValueError`
        If `label` is not one of the three SNLI gold labels.
        (A `ValueError` is raised instead of using `assert`, since asserts are
        stripped when Python runs with `-O`.)
    """
    if label not in ("contradiction", "neutral", "entailment"):
        raise ValueError(f"Unexpected SNLI label: {label!r}")
    if collapse and label in ("contradiction", "neutral"):
        return "non-entailment"
    return label
|
||
|
||
@DatasetReader.register("snli_for_bias")
class SnliReader(DatasetReader):
    """
    Reads a file from the Stanford Natural Language Inference (SNLI) dataset. This data is
    formatted as jsonl, one json-formatted instance per line. The keys in the data are
    "gold_label", "sentence1", and "sentence2". We convert these keys into fields named "label",
    "premise" and "hypothesis", along with a metadata field containing the tokenized strings of the
    premise and hypothesis.

    Registered as a `DatasetReader` with name "snli_for_bias".

    # Parameters

    tokenizer : `Tokenizer`, optional (default=`SpacyTokenizer()`)
        We use this `Tokenizer` for both the premise and the hypothesis. See :class:`Tokenizer`.
    token_indexers : `Dict[str, TokenIndexer]`, optional (default=`{"tokens": SingleIdTokenIndexer()}`)
        We similarly use this for both the premise and the hypothesis. See :class:`TokenIndexer`.
    combine_input_fields : `bool`, optional
        (default=`isinstance(tokenizer, PretrainedTransformerTokenizer)`)
        If False, represent the premise and the hypothesis as separate fields in the instance.
        If True, tokenize them together using `tokenizer.add_special_tokens()`
        and provide a single `tokens` field in the instance.
    collapse_labels : `bool`, optional (default=`False`)
        If `True`, the "neutral" and "contradiction" labels will be collapsed into "non-entailment";
        "entailment" will be left unchanged.
    """

    def __init__(
        self,
        tokenizer: Optional[Tokenizer] = None,
        token_indexers: Optional[Dict[str, TokenIndexer]] = None,
        combine_input_fields: Optional[bool] = None,
        collapse_labels: bool = False,
        **kwargs,
    ) -> None:
        # Sharding is handled manually in `_read` via `shard_iterable`.
        super().__init__(
            manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs
        )
        self._tokenizer = tokenizer or SpacyTokenizer()
        if isinstance(self._tokenizer, PretrainedTransformerTokenizer):
            # Special tokens are added explicitly in `text_to_instance`, so the
            # tokenizer must not add them a second time.
            assert not self._tokenizer._add_special_tokens
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        if combine_input_fields is not None:
            self._combine_input_fields = combine_input_fields
        else:
            # Transformer tokenizers expect sentence pairs in a single sequence.
            self._combine_input_fields = isinstance(self._tokenizer, PretrainedTransformerTokenizer)
        self.collapse_labels = collapse_labels

    @overrides
    def _read(self, file_path: str):
        # If `file_path` is a URL, redirect to the cache.
        file_path = cached_path(file_path)
        with open(file_path, "r") as snli_file:
            example_iter = (json.loads(line) for line in snli_file)
            # A gold label of "-" means the annotators could not agree; skip those.
            filtered_example_iter = (
                example for example in example_iter if example.get("gold_label") != "-"
            )
            for example in self.shard_iterable(filtered_example_iter):
                label = example.get("gold_label")
                premise = example["sentence1"]
                hypothesis = example["sentence2"]
                yield self.text_to_instance(premise, hypothesis, label)

    @overrides
    def text_to_instance(  # type: ignore
        self, premise: str, hypothesis: str, label: Optional[str] = None
    ) -> Instance:
        """
        Tokenize the premise and hypothesis and assemble an `Instance`, either as a
        single combined `tokens` field or as separate `premise`/`hypothesis` fields,
        plus a metadata field with the raw token strings and an optional `label` field.
        """
        fields: Dict[str, Field] = {}
        premise_tokens = self._tokenizer.tokenize(premise)
        hypothesis_tokens = self._tokenizer.tokenize(hypothesis)

        if self._combine_input_fields:
            tokens = self._tokenizer.add_special_tokens(premise_tokens, hypothesis_tokens)
            fields["tokens"] = TextField(tokens)
        else:
            premise_with_special_tokens = self._tokenizer.add_special_tokens(premise_tokens)
            hypothesis_with_special_tokens = self._tokenizer.add_special_tokens(hypothesis_tokens)
            fields["premise"] = TextField(premise_with_special_tokens)
            fields["hypothesis"] = TextField(hypothesis_with_special_tokens)

        # Bug fix: previously this referenced `premise_tokens`/`hypothesis_tokens`
        # that were only bound in the `else` branch, raising a NameError whenever
        # `combine_input_fields` was True. Build metadata from the tokenized
        # sentences, which exist in both branches.
        metadata = {
            "premise_tokens": [token.text for token in premise_tokens],
            "hypothesis_tokens": [token.text for token in hypothesis_tokens],
        }
        fields["metadata"] = MetadataField(metadata)

        if label:
            maybe_collapsed_label = maybe_collapse_label(label, self.collapse_labels)
            fields["label"] = LabelField(maybe_collapsed_label)

        return Instance(fields)

    @overrides
    def apply_token_indexers(self, instance: Instance):
        # Token indexers are attached lazily so instances can be created (and
        # pickled for multiprocess loading) without them.
        if "tokens" in instance.fields:
            instance.fields["tokens"]._token_indexers = self._token_indexers  # type: ignore
        else:
            instance.fields["premise"]._token_indexers = self._token_indexers  # type: ignore
            instance.fields["hypothesis"]._token_indexers = self._token_indexers  # type: ignore
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are we including this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, we are not!