diff --git a/examples/sentence_similarity/01-prep-data/snli.ipynb b/examples/sentence_similarity/01-prep-data/snli.ipynb
index 847bc59c3..c886b1f10 100644
--- a/examples/sentence_similarity/01-prep-data/snli.ipynb
+++ b/examples/sentence_similarity/01-prep-data/snli.ipynb
@@ -38,7 +38,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "System version: 3.6.8 |Anaconda, Inc.| (default, Feb 21 2019, 18:30:04) [MSC v.1916 64 bit (AMD64)]\n"
+ "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n",
+ "[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]\n"
]
}
],
@@ -90,16 +91,13 @@
"# defaults to txt\n",
"train = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")\n",
"\n",
- "#load dataframe from jsonl file format\n",
- "dev = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"dev\", file_type=\"jsonl\")\n",
- "\n",
- "#specify txt format \n",
- "test = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"test\", file_type=\"txt\")\n"
+ "# or, load dataframe from jsonl\n",
+ "dev = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"dev\", file_type=\"jsonl\")"
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -280,7 +278,7 @@
"4 2267923837.jpg#2r1e entailment NaN NaN NaN NaN "
]
},
- "execution_count": 6,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -289,422 +287,44 @@
"train.head()"
]
},
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " annotator_labels | \n",
- " captionID | \n",
- " gold_label | \n",
- " pairID | \n",
- " sentence1 | \n",
- " sentence1_binary_parse | \n",
- " sentence1_parse | \n",
- " sentence2 | \n",
- " sentence2_binary_parse | \n",
- " sentence2_parse | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " [neutral, entailment, neutral, neutral, neutral] | \n",
- " 4705552913.jpg#2 | \n",
- " neutral | \n",
- " 4705552913.jpg#2r1n | \n",
- " Two women are embracing while holding to go pa... | \n",
- " ( ( Two women ) ( ( are ( embracing ( while ( ... | \n",
- " (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar... | \n",
- " The sisters are hugging goodbye while holding ... | \n",
- " ( ( The sisters ) ( ( are ( ( hugging goodbye ... | \n",
- " (ROOT (S (NP (DT The) (NNS sisters)) (VP (VBP ... | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " [entailment, entailment, entailment, entailmen... | \n",
- " 4705552913.jpg#2 | \n",
- " entailment | \n",
- " 4705552913.jpg#2r1e | \n",
- " Two women are embracing while holding to go pa... | \n",
- " ( ( Two women ) ( ( are ( embracing ( while ( ... | \n",
- " (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar... | \n",
- " Two woman are holding packages. | \n",
- " ( ( Two woman ) ( ( are ( holding packages ) )... | \n",
- " (ROOT (S (NP (CD Two) (NN woman)) (VP (VBP are... | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " [contradiction, contradiction, contradiction, ... | \n",
- " 4705552913.jpg#2 | \n",
- " contradiction | \n",
- " 4705552913.jpg#2r1c | \n",
- " Two women are embracing while holding to go pa... | \n",
- " ( ( Two women ) ( ( are ( embracing ( while ( ... | \n",
- " (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar... | \n",
- " The men are fighting outside a deli. | \n",
- " ( ( The men ) ( ( are ( fighting ( outside ( a... | \n",
- " (ROOT (S (NP (DT The) (NNS men)) (VP (VBP are)... | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " [entailment, entailment, entailment, entailmen... | \n",
- " 2407214681.jpg#0 | \n",
- " entailment | \n",
- " 2407214681.jpg#0r1e | \n",
- " Two young children in blue jerseys, one with t... | \n",
- " ( ( ( Two ( young children ) ) ( in ( ( ( ( ( ... | \n",
- " (ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil... | \n",
- " Two kids in numbered jerseys wash their hands. | \n",
- " ( ( ( Two kids ) ( in ( numbered jerseys ) ) )... | \n",
- " (ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ... | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " [neutral, neutral, neutral, entailment, entail... | \n",
- " 2407214681.jpg#0 | \n",
- " neutral | \n",
- " 2407214681.jpg#0r1n | \n",
- " Two young children in blue jerseys, one with t... | \n",
- " ( ( ( Two ( young children ) ) ( in ( ( ( ( ( ... | \n",
- " (ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil... | \n",
- " Two kids at a ballgame wash their hands. | \n",
- " ( ( ( Two kids ) ( at ( a ballgame ) ) ) ( ( w... | \n",
- " (ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " annotator_labels captionID \\\n",
- "0 [neutral, entailment, neutral, neutral, neutral] 4705552913.jpg#2 \n",
- "1 [entailment, entailment, entailment, entailmen... 4705552913.jpg#2 \n",
- "2 [contradiction, contradiction, contradiction, ... 4705552913.jpg#2 \n",
- "3 [entailment, entailment, entailment, entailmen... 2407214681.jpg#0 \n",
- "4 [neutral, neutral, neutral, entailment, entail... 2407214681.jpg#0 \n",
- "\n",
- " gold_label pairID \\\n",
- "0 neutral 4705552913.jpg#2r1n \n",
- "1 entailment 4705552913.jpg#2r1e \n",
- "2 contradiction 4705552913.jpg#2r1c \n",
- "3 entailment 2407214681.jpg#0r1e \n",
- "4 neutral 2407214681.jpg#0r1n \n",
- "\n",
- " sentence1 \\\n",
- "0 Two women are embracing while holding to go pa... \n",
- "1 Two women are embracing while holding to go pa... \n",
- "2 Two women are embracing while holding to go pa... \n",
- "3 Two young children in blue jerseys, one with t... \n",
- "4 Two young children in blue jerseys, one with t... \n",
- "\n",
- " sentence1_binary_parse \\\n",
- "0 ( ( Two women ) ( ( are ( embracing ( while ( ... \n",
- "1 ( ( Two women ) ( ( are ( embracing ( while ( ... \n",
- "2 ( ( Two women ) ( ( are ( embracing ( while ( ... \n",
- "3 ( ( ( Two ( young children ) ) ( in ( ( ( ( ( ... \n",
- "4 ( ( ( Two ( young children ) ) ( in ( ( ( ( ( ... \n",
- "\n",
- " sentence1_parse \\\n",
- "0 (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar... \n",
- "1 (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar... \n",
- "2 (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar... \n",
- "3 (ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil... \n",
- "4 (ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil... \n",
- "\n",
- " sentence2 \\\n",
- "0 The sisters are hugging goodbye while holding ... \n",
- "1 Two woman are holding packages. \n",
- "2 The men are fighting outside a deli. \n",
- "3 Two kids in numbered jerseys wash their hands. \n",
- "4 Two kids at a ballgame wash their hands. \n",
- "\n",
- " sentence2_binary_parse \\\n",
- "0 ( ( The sisters ) ( ( are ( ( hugging goodbye ... \n",
- "1 ( ( Two woman ) ( ( are ( holding packages ) )... \n",
- "2 ( ( The men ) ( ( are ( fighting ( outside ( a... \n",
- "3 ( ( ( Two kids ) ( in ( numbered jerseys ) ) )... \n",
- "4 ( ( ( Two kids ) ( at ( a ballgame ) ) ) ( ( w... \n",
- "\n",
- " sentence2_parse \n",
- "0 (ROOT (S (NP (DT The) (NNS sisters)) (VP (VBP ... \n",
- "1 (ROOT (S (NP (CD Two) (NN woman)) (VP (VBP are... \n",
- "2 (ROOT (S (NP (DT The) (NNS men)) (VP (VBP are)... \n",
- "3 (ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ... \n",
- "4 (ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ... "
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "dev.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " gold_label | \n",
- " sentence1_binary_parse | \n",
- " sentence2_binary_parse | \n",
- " sentence1_parse | \n",
- " sentence2_parse | \n",
- " sentence1 | \n",
- " sentence2 | \n",
- " captionID | \n",
- " pairID | \n",
- " label1 | \n",
- " label2 | \n",
- " label3 | \n",
- " label4 | \n",
- " label5 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " neutral | \n",
- " ( ( This ( church choir ) ) ( ( ( sings ( to (... | \n",
- " ( ( The church ) ( ( has ( cracks ( in ( the c... | \n",
- " (ROOT (S (NP (DT This) (NN church) (NN choir))... | \n",
- " (ROOT (S (NP (DT The) (NN church)) (VP (VBZ ha... | \n",
- " This church choir sings to the masses as they ... | \n",
- " The church has cracks in the ceiling. | \n",
- " 2677109430.jpg#1 | \n",
- " 2677109430.jpg#1r1n | \n",
- " neutral | \n",
- " contradiction | \n",
- " contradiction | \n",
- " neutral | \n",
- " neutral | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " entailment | \n",
- " ( ( This ( church choir ) ) ( ( ( sings ( to (... | \n",
- " ( ( The church ) ( ( is ( filled ( with song )... | \n",
- " (ROOT (S (NP (DT This) (NN church) (NN choir))... | \n",
- " (ROOT (S (NP (DT The) (NN church)) (VP (VBZ is... | \n",
- " This church choir sings to the masses as they ... | \n",
- " The church is filled with song. | \n",
- " 2677109430.jpg#1 | \n",
- " 2677109430.jpg#1r1e | \n",
- " entailment | \n",
- " entailment | \n",
- " entailment | \n",
- " neutral | \n",
- " entailment | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " contradiction | \n",
- " ( ( This ( church choir ) ) ( ( ( sings ( to (... | \n",
- " ( ( ( A choir ) ( singing ( at ( a ( baseball ... | \n",
- " (ROOT (S (NP (DT This) (NN church) (NN choir))... | \n",
- " (ROOT (NP (NP (DT A) (NN choir)) (VP (VBG sing... | \n",
- " This church choir sings to the masses as they ... | \n",
- " A choir singing at a baseball game. | \n",
- " 2677109430.jpg#1 | \n",
- " 2677109430.jpg#1r1c | \n",
- " contradiction | \n",
- " contradiction | \n",
- " contradiction | \n",
- " contradiction | \n",
- " contradiction | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " neutral | \n",
- " ( ( ( A woman ) ( with ( ( ( ( ( a ( green hea... | \n",
- " ( ( The woman ) ( ( is young ) . ) ) | \n",
- " (ROOT (NP (NP (DT A) (NN woman)) (PP (IN with)... | \n",
- " (ROOT (S (NP (DT The) (NN woman)) (VP (VBZ is)... | \n",
- " A woman with a green headscarf, blue shirt and... | \n",
- " The woman is young. | \n",
- " 6160193920.jpg#4 | \n",
- " 6160193920.jpg#4r1n | \n",
- " neutral | \n",
- " neutral | \n",
- " neutral | \n",
- " neutral | \n",
- " neutral | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " entailment | \n",
- " ( ( ( A woman ) ( with ( ( ( ( ( a ( green hea... | \n",
- " ( ( The woman ) ( ( is ( very happy ) ) . ) ) | \n",
- " (ROOT (NP (NP (DT A) (NN woman)) (PP (IN with)... | \n",
- " (ROOT (S (NP (DT The) (NN woman)) (VP (VBZ is)... | \n",
- " A woman with a green headscarf, blue shirt and... | \n",
- " The woman is very happy. | \n",
- " 6160193920.jpg#4 | \n",
- " 6160193920.jpg#4r1e | \n",
- " entailment | \n",
- " entailment | \n",
- " contradiction | \n",
- " entailment | \n",
- " neutral | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " gold_label sentence1_binary_parse \\\n",
- "0 neutral ( ( This ( church choir ) ) ( ( ( sings ( to (... \n",
- "1 entailment ( ( This ( church choir ) ) ( ( ( sings ( to (... \n",
- "2 contradiction ( ( This ( church choir ) ) ( ( ( sings ( to (... \n",
- "3 neutral ( ( ( A woman ) ( with ( ( ( ( ( a ( green hea... \n",
- "4 entailment ( ( ( A woman ) ( with ( ( ( ( ( a ( green hea... \n",
- "\n",
- " sentence2_binary_parse \\\n",
- "0 ( ( The church ) ( ( has ( cracks ( in ( the c... \n",
- "1 ( ( The church ) ( ( is ( filled ( with song )... \n",
- "2 ( ( ( A choir ) ( singing ( at ( a ( baseball ... \n",
- "3 ( ( The woman ) ( ( is young ) . ) ) \n",
- "4 ( ( The woman ) ( ( is ( very happy ) ) . ) ) \n",
- "\n",
- " sentence1_parse \\\n",
- "0 (ROOT (S (NP (DT This) (NN church) (NN choir))... \n",
- "1 (ROOT (S (NP (DT This) (NN church) (NN choir))... \n",
- "2 (ROOT (S (NP (DT This) (NN church) (NN choir))... \n",
- "3 (ROOT (NP (NP (DT A) (NN woman)) (PP (IN with)... \n",
- "4 (ROOT (NP (NP (DT A) (NN woman)) (PP (IN with)... \n",
- "\n",
- " sentence2_parse \\\n",
- "0 (ROOT (S (NP (DT The) (NN church)) (VP (VBZ ha... \n",
- "1 (ROOT (S (NP (DT The) (NN church)) (VP (VBZ is... \n",
- "2 (ROOT (NP (NP (DT A) (NN choir)) (VP (VBG sing... \n",
- "3 (ROOT (S (NP (DT The) (NN woman)) (VP (VBZ is)... \n",
- "4 (ROOT (S (NP (DT The) (NN woman)) (VP (VBZ is)... \n",
- "\n",
- " sentence1 \\\n",
- "0 This church choir sings to the masses as they ... \n",
- "1 This church choir sings to the masses as they ... \n",
- "2 This church choir sings to the masses as they ... \n",
- "3 A woman with a green headscarf, blue shirt and... \n",
- "4 A woman with a green headscarf, blue shirt and... \n",
- "\n",
- " sentence2 captionID \\\n",
- "0 The church has cracks in the ceiling. 2677109430.jpg#1 \n",
- "1 The church is filled with song. 2677109430.jpg#1 \n",
- "2 A choir singing at a baseball game. 2677109430.jpg#1 \n",
- "3 The woman is young. 6160193920.jpg#4 \n",
- "4 The woman is very happy. 6160193920.jpg#4 \n",
- "\n",
- " pairID label1 label2 label3 \\\n",
- "0 2677109430.jpg#1r1n neutral contradiction contradiction \n",
- "1 2677109430.jpg#1r1e entailment entailment entailment \n",
- "2 2677109430.jpg#1r1c contradiction contradiction contradiction \n",
- "3 6160193920.jpg#4r1n neutral neutral neutral \n",
- "4 6160193920.jpg#4r1e entailment entailment contradiction \n",
- "\n",
- " label4 label5 \n",
- "0 neutral neutral \n",
- "1 neutral entailment \n",
- "2 contradiction contradiction \n",
- "3 neutral neutral \n",
- "4 entailment neutral "
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "test.head()"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 02 Tokenize\n",
"\n",
- "We have loaded the dataset into pandas.DataFrame, we now convert sentences to tokens.\n",
+ "Now that we've loaded the data into a pandas.DataFrame, we can tokenize the sentences.\n",
"We also clean the data before tokenizing. This includes dropping unnecessary columns and renaming the relevant columns as score, sentence1, and sentence2."
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def clean(df, file_split):\n",
" src_file_path = os.path.join(BASE_DATA_PATH, \"raw/snli_1.0/snli_1.0_{}.txt\".format(file_split))\n",
- " return snli.clean_snli(src_file_path).dropna() # drop rows with any NaN vals"
+ " if not os.path.exists(os.path.join(BASE_DATA_PATH, \"clean/snli_1.0\")):\n",
+ " os.makedirs(os.path.join(BASE_DATA_PATH, \"clean/snli_1.0\"))\n",
+ " dest_file_path = os.path.join(BASE_DATA_PATH, \"clean/snli_1.0/snli_1.0_{}.txt\".format(file_split))\n",
+ " clean_df = snli.clean_snli(src_file_path).dropna() # drop rows with any NaN vals\n",
+ " clean_df.to_csv(dest_file_path)\n",
+ " return clean_df"
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
- "train = clean(train, 'train')\n",
- "dev = clean(dev, 'dev')\n",
- "test = clean(test, 'test')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Glimpse of the data"
+ "train = clean(train, 'train')"
]
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -784,7 +404,7 @@
"4 There are children present "
]
},
- "execution_count": 11,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -793,225 +413,34 @@
"train.head()"
]
},
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " score | \n",
- " sentence1 | \n",
- " sentence2 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " neutral | \n",
- " Two women are embracing while holding to go pa... | \n",
- " The sisters are hugging goodbye while holding ... | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " entailment | \n",
- " Two women are embracing while holding to go pa... | \n",
- " Two woman are holding packages. | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " contradiction | \n",
- " Two women are embracing while holding to go pa... | \n",
- " The men are fighting outside a deli. | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " entailment | \n",
- " Two young children in blue jerseys, one with t... | \n",
- " Two kids in numbered jerseys wash their hands. | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " neutral | \n",
- " Two young children in blue jerseys, one with t... | \n",
- " Two kids at a ballgame wash their hands. | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " score sentence1 \\\n",
- "0 neutral Two women are embracing while holding to go pa... \n",
- "1 entailment Two women are embracing while holding to go pa... \n",
- "2 contradiction Two women are embracing while holding to go pa... \n",
- "3 entailment Two young children in blue jerseys, one with t... \n",
- "4 neutral Two young children in blue jerseys, one with t... \n",
- "\n",
- " sentence2 \n",
- "0 The sisters are hugging goodbye while holding ... \n",
- "1 Two woman are holding packages. \n",
- "2 The men are fighting outside a deli. \n",
- "3 Two kids in numbered jerseys wash their hands. \n",
- "4 Two kids at a ballgame wash their hands. "
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "dev.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " score | \n",
- " sentence1 | \n",
- " sentence2 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " neutral | \n",
- " This church choir sings to the masses as they ... | \n",
- " The church has cracks in the ceiling. | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " entailment | \n",
- " This church choir sings to the masses as they ... | \n",
- " The church is filled with song. | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " contradiction | \n",
- " This church choir sings to the masses as they ... | \n",
- " A choir singing at a baseball game. | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " neutral | \n",
- " A woman with a green headscarf, blue shirt and... | \n",
- " The woman is young. | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " entailment | \n",
- " A woman with a green headscarf, blue shirt and... | \n",
- " The woman is very happy. | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " score sentence1 \\\n",
- "0 neutral This church choir sings to the masses as they ... \n",
- "1 entailment This church choir sings to the masses as they ... \n",
- "2 contradiction This church choir sings to the masses as they ... \n",
- "3 neutral A woman with a green headscarf, blue shirt and... \n",
- "4 entailment A woman with a green headscarf, blue shirt and... \n",
- "\n",
- " sentence2 \n",
- "0 The church has cracks in the ceiling. \n",
- "1 The church is filled with song. \n",
- "2 A choir singing at a baseball game. \n",
- "3 The woman is young. \n",
- "4 The woman is very happy. "
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "test.head()"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Once we have the clean pandas dataframes, we do lowercase standardization and tokenization. We use the [NLTK] (https://www.nltk.org/) library for tokenization."
+ "Once we have the clean pandas dataframes, we do lowercase standardization and tokenization. We use the [NLTK](https://www.nltk.org/) library for tokenization."
]
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "[nltk_data] Downloading package punkt to\n",
- "[nltk_data] C:\\Users\\jamahaja\\AppData\\Roaming\\nltk_data...\n",
- "[nltk_data] Package punkt is already up-to-date!\n",
- "[nltk_data] Downloading package punkt to\n",
- "[nltk_data] C:\\Users\\jamahaja\\AppData\\Roaming\\nltk_data...\n",
- "[nltk_data] Package punkt is already up-to-date!\n",
- "[nltk_data] Downloading package punkt to\n",
- "[nltk_data] C:\\Users\\jamahaja\\AppData\\Roaming\\nltk_data...\n",
+ "[nltk_data] Downloading package punkt to /Users/caseyhong/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"source": [
- "train_tok = to_nltk_tokens(to_lowercase(train))\n",
- "dev_tok = to_nltk_tokens(to_lowercase(dev))\n",
- "test_tok = to_nltk_tokens(to_lowercase(test))"
+ "train_tok = to_nltk_tokens(to_lowercase(train))"
]
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -1046,42 +475,42 @@
" \n",
" 0 | \n",
" neutral | \n",
- " two women are embracing while holding to go pa... | \n",
- " the sisters are hugging goodbye while holding ... | \n",
- " [two, women, are, embracing, while, holding, t... | \n",
- " [the, sisters, are, hugging, goodbye, while, h... | \n",
+ " a person on a horse jumps over a broken down a... | \n",
+ " a person is training his horse for a competition. | \n",
+ " [a, person, on, a, horse, jumps, over, a, brok... | \n",
+ " [a, person, is, training, his, horse, for, a, ... | \n",
"
\n",
" \n",
" 1 | \n",
- " entailment | \n",
- " two women are embracing while holding to go pa... | \n",
- " two woman are holding packages. | \n",
- " [two, women, are, embracing, while, holding, t... | \n",
- " [two, woman, are, holding, packages, .] | \n",
+ " contradiction | \n",
+ " a person on a horse jumps over a broken down a... | \n",
+ " a person is at a diner, ordering an omelette. | \n",
+ " [a, person, on, a, horse, jumps, over, a, brok... | \n",
+ " [a, person, is, at, a, diner, ,, ordering, an,... | \n",
"
\n",
" \n",
" 2 | \n",
- " contradiction | \n",
- " two women are embracing while holding to go pa... | \n",
- " the men are fighting outside a deli. | \n",
- " [two, women, are, embracing, while, holding, t... | \n",
- " [the, men, are, fighting, outside, a, deli, .] | \n",
+ " entailment | \n",
+ " a person on a horse jumps over a broken down a... | \n",
+ " a person is outdoors, on a horse. | \n",
+ " [a, person, on, a, horse, jumps, over, a, brok... | \n",
+ " [a, person, is, outdoors, ,, on, a, horse, .] | \n",
"
\n",
" \n",
" 3 | \n",
- " entailment | \n",
- " two young children in blue jerseys, one with t... | \n",
- " two kids in numbered jerseys wash their hands. | \n",
- " [two, young, children, in, blue, jerseys, ,, o... | \n",
- " [two, kids, in, numbered, jerseys, wash, their... | \n",
+ " neutral | \n",
+ " children smiling and waving at camera | \n",
+ " they are smiling at their parents | \n",
+ " [children, smiling, and, waving, at, camera] | \n",
+ " [they, are, smiling, at, their, parents] | \n",
"
\n",
" \n",
" 4 | \n",
- " neutral | \n",
- " two young children in blue jerseys, one with t... | \n",
- " two kids at a ballgame wash their hands. | \n",
- " [two, young, children, in, blue, jerseys, ,, o... | \n",
- " [two, kids, at, a, ballgame, wash, their, hand... | \n",
+ " entailment | \n",
+ " children smiling and waving at camera | \n",
+ " there are children present | \n",
+ " [children, smiling, and, waving, at, camera] | \n",
+ " [there, are, children, present] | \n",
"
\n",
" \n",
"\n",
@@ -1089,41 +518,41 @@
],
"text/plain": [
" score sentence1 \\\n",
- "0 neutral two women are embracing while holding to go pa... \n",
- "1 entailment two women are embracing while holding to go pa... \n",
- "2 contradiction two women are embracing while holding to go pa... \n",
- "3 entailment two young children in blue jerseys, one with t... \n",
- "4 neutral two young children in blue jerseys, one with t... \n",
+ "0 neutral a person on a horse jumps over a broken down a... \n",
+ "1 contradiction a person on a horse jumps over a broken down a... \n",
+ "2 entailment a person on a horse jumps over a broken down a... \n",
+ "3 neutral children smiling and waving at camera \n",
+ "4 entailment children smiling and waving at camera \n",
"\n",
" sentence2 \\\n",
- "0 the sisters are hugging goodbye while holding ... \n",
- "1 two woman are holding packages. \n",
- "2 the men are fighting outside a deli. \n",
- "3 two kids in numbered jerseys wash their hands. \n",
- "4 two kids at a ballgame wash their hands. \n",
+ "0 a person is training his horse for a competition. \n",
+ "1 a person is at a diner, ordering an omelette. \n",
+ "2 a person is outdoors, on a horse. \n",
+ "3 they are smiling at their parents \n",
+ "4 there are children present \n",
"\n",
" sentence1_tokens \\\n",
- "0 [two, women, are, embracing, while, holding, t... \n",
- "1 [two, women, are, embracing, while, holding, t... \n",
- "2 [two, women, are, embracing, while, holding, t... \n",
- "3 [two, young, children, in, blue, jerseys, ,, o... \n",
- "4 [two, young, children, in, blue, jerseys, ,, o... \n",
+ "0 [a, person, on, a, horse, jumps, over, a, brok... \n",
+ "1 [a, person, on, a, horse, jumps, over, a, brok... \n",
+ "2 [a, person, on, a, horse, jumps, over, a, brok... \n",
+ "3 [children, smiling, and, waving, at, camera] \n",
+ "4 [children, smiling, and, waving, at, camera] \n",
"\n",
" sentence2_tokens \n",
- "0 [the, sisters, are, hugging, goodbye, while, h... \n",
- "1 [two, woman, are, holding, packages, .] \n",
- "2 [the, men, are, fighting, outside, a, deli, .] \n",
- "3 [two, kids, in, numbered, jerseys, wash, their... \n",
- "4 [two, kids, at, a, ballgame, wash, their, hand... "
+ "0 [a, person, is, training, his, horse, for, a, ... \n",
+ "1 [a, person, is, at, a, diner, ,, ordering, an,... \n",
+ "2 [a, person, is, outdoors, ,, on, a, horse, .] \n",
+ "3 [they, are, smiling, at, their, parents] \n",
+ "4 [there, are, children, present] "
]
},
- "execution_count": 15,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "dev_tok.head()"
+ "train_tok.head()"
]
},
{
@@ -1138,10 +567,35 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 10,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package punkt to /Users/caseyhong/nltk_data...\n",
+ "[nltk_data] Package punkt is already up-to-date!\n",
+ "[nltk_data] Downloading package punkt to /Users/caseyhong/nltk_data...\n",
+ "[nltk_data] Package punkt is already up-to-date!\n",
+ "[nltk_data] Downloading package punkt to /Users/caseyhong/nltk_data...\n",
+ "[nltk_data] Package punkt is already up-to-date!\n"
+ ]
+ }
+ ],
"source": [
+ "train = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")\n",
+ "dev = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"dev\")\n",
+ "test = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"test\")\n",
+ "# clean each split (clean() picks the raw file by file_split); pass the matching frame for each split\n",
+ "clean_train = clean(train, file_split=\"train\")\n",
+ "clean_dev = clean(dev, file_split=\"dev\")\n",
+ "clean_test = clean(test, file_split=\"test\")\n",
+ "# lowercase, then NLTK-tokenize each cleaned split\n",
+ "train_tok = to_nltk_tokens(to_lowercase(clean_train))\n",
+ "dev_tok = to_nltk_tokens(to_lowercase(clean_dev))\n",
+ "test_tok = to_nltk_tokens(to_lowercase(clean_test))\n",
+ "\n",
"split_map = {'train': train_tok, 'dev': dev_tok, 'test': test_tok}\n",
"for file_split, df in split_map.items():\n",
" base_txt_path = os.path.join(BASE_DATA_PATH, \"clean/snli_1.0/snli_1.0_{}.txt\".format(file_split))\n",
@@ -1155,7 +609,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -1176,13 +630,6 @@
" shutil.move(\"{}.tmp\".format(s1_tok_path), s1_tok_path)\n",
" shutil.move(\"{}.tmp\".format(s2_tok_path), s2_tok_path)"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {