diff --git a/examples/sentence_similarity/01-prep-data/snli.ipynb b/examples/sentence_similarity/01-prep-data/snli.ipynb index 847bc59c3..c886b1f10 100644 --- a/examples/sentence_similarity/01-prep-data/snli.ipynb +++ b/examples/sentence_similarity/01-prep-data/snli.ipynb @@ -38,7 +38,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "System version: 3.6.8 |Anaconda, Inc.| (default, Feb 21 2019, 18:30:04) [MSC v.1916 64 bit (AMD64)]\n" + "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n", + "[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]\n" ] } ], @@ -90,16 +91,13 @@ "# defaults to txt\n", "train = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")\n", "\n", - "#load dataframe from jsonl file format\n", - "dev = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"dev\", file_type=\"jsonl\")\n", - "\n", - "#specify txt format \n", - "test = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"test\", file_type=\"txt\")\n" + "# or, load dataframe from jsonl\n", + "dev = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"dev\", file_type=\"jsonl\")" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -280,7 +278,7 @@ "4 2267923837.jpg#2r1e entailment NaN NaN NaN NaN " ] }, - "execution_count": 6, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -289,422 +287,44 @@ "train.head()" ] }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
annotator_labelscaptionIDgold_labelpairIDsentence1sentence1_binary_parsesentence1_parsesentence2sentence2_binary_parsesentence2_parse
0[neutral, entailment, neutral, neutral, neutral]4705552913.jpg#2neutral4705552913.jpg#2r1nTwo women are embracing while holding to go pa...( ( Two women ) ( ( are ( embracing ( while ( ...(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar...The sisters are hugging goodbye while holding ...( ( The sisters ) ( ( are ( ( hugging goodbye ...(ROOT (S (NP (DT The) (NNS sisters)) (VP (VBP ...
1[entailment, entailment, entailment, entailmen...4705552913.jpg#2entailment4705552913.jpg#2r1eTwo women are embracing while holding to go pa...( ( Two women ) ( ( are ( embracing ( while ( ...(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar...Two woman are holding packages.( ( Two woman ) ( ( are ( holding packages ) )...(ROOT (S (NP (CD Two) (NN woman)) (VP (VBP are...
2[contradiction, contradiction, contradiction, ...4705552913.jpg#2contradiction4705552913.jpg#2r1cTwo women are embracing while holding to go pa...( ( Two women ) ( ( are ( embracing ( while ( ...(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar...The men are fighting outside a deli.( ( The men ) ( ( are ( fighting ( outside ( a...(ROOT (S (NP (DT The) (NNS men)) (VP (VBP are)...
3[entailment, entailment, entailment, entailmen...2407214681.jpg#0entailment2407214681.jpg#0r1eTwo young children in blue jerseys, one with t...( ( ( Two ( young children ) ) ( in ( ( ( ( ( ...(ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil...Two kids in numbered jerseys wash their hands.( ( ( Two kids ) ( in ( numbered jerseys ) ) )...(ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ...
4[neutral, neutral, neutral, entailment, entail...2407214681.jpg#0neutral2407214681.jpg#0r1nTwo young children in blue jerseys, one with t...( ( ( Two ( young children ) ) ( in ( ( ( ( ( ...(ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil...Two kids at a ballgame wash their hands.( ( ( Two kids ) ( at ( a ballgame ) ) ) ( ( w...(ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ...
\n", - "
" - ], - "text/plain": [ - " annotator_labels captionID \\\n", - "0 [neutral, entailment, neutral, neutral, neutral] 4705552913.jpg#2 \n", - "1 [entailment, entailment, entailment, entailmen... 4705552913.jpg#2 \n", - "2 [contradiction, contradiction, contradiction, ... 4705552913.jpg#2 \n", - "3 [entailment, entailment, entailment, entailmen... 2407214681.jpg#0 \n", - "4 [neutral, neutral, neutral, entailment, entail... 2407214681.jpg#0 \n", - "\n", - " gold_label pairID \\\n", - "0 neutral 4705552913.jpg#2r1n \n", - "1 entailment 4705552913.jpg#2r1e \n", - "2 contradiction 4705552913.jpg#2r1c \n", - "3 entailment 2407214681.jpg#0r1e \n", - "4 neutral 2407214681.jpg#0r1n \n", - "\n", - " sentence1 \\\n", - "0 Two women are embracing while holding to go pa... \n", - "1 Two women are embracing while holding to go pa... \n", - "2 Two women are embracing while holding to go pa... \n", - "3 Two young children in blue jerseys, one with t... \n", - "4 Two young children in blue jerseys, one with t... \n", - "\n", - " sentence1_binary_parse \\\n", - "0 ( ( Two women ) ( ( are ( embracing ( while ( ... \n", - "1 ( ( Two women ) ( ( are ( embracing ( while ( ... \n", - "2 ( ( Two women ) ( ( are ( embracing ( while ( ... \n", - "3 ( ( ( Two ( young children ) ) ( in ( ( ( ( ( ... \n", - "4 ( ( ( Two ( young children ) ) ( in ( ( ( ( ( ... \n", - "\n", - " sentence1_parse \\\n", - "0 (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar... \n", - "1 (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar... \n", - "2 (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar... \n", - "3 (ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil... \n", - "4 (ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil... \n", - "\n", - " sentence2 \\\n", - "0 The sisters are hugging goodbye while holding ... \n", - "1 Two woman are holding packages. \n", - "2 The men are fighting outside a deli. \n", - "3 Two kids in numbered jerseys wash their hands. \n", - "4 Two kids at a ballgame wash their hands. 
\n", - "\n", - " sentence2_binary_parse \\\n", - "0 ( ( The sisters ) ( ( are ( ( hugging goodbye ... \n", - "1 ( ( Two woman ) ( ( are ( holding packages ) )... \n", - "2 ( ( The men ) ( ( are ( fighting ( outside ( a... \n", - "3 ( ( ( Two kids ) ( in ( numbered jerseys ) ) )... \n", - "4 ( ( ( Two kids ) ( at ( a ballgame ) ) ) ( ( w... \n", - "\n", - " sentence2_parse \n", - "0 (ROOT (S (NP (DT The) (NNS sisters)) (VP (VBP ... \n", - "1 (ROOT (S (NP (CD Two) (NN woman)) (VP (VBP are... \n", - "2 (ROOT (S (NP (DT The) (NNS men)) (VP (VBP are)... \n", - "3 (ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ... \n", - "4 (ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ... " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dev.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
gold_labelsentence1_binary_parsesentence2_binary_parsesentence1_parsesentence2_parsesentence1sentence2captionIDpairIDlabel1label2label3label4label5
0neutral( ( This ( church choir ) ) ( ( ( sings ( to (...( ( The church ) ( ( has ( cracks ( in ( the c...(ROOT (S (NP (DT This) (NN church) (NN choir))...(ROOT (S (NP (DT The) (NN church)) (VP (VBZ ha...This church choir sings to the masses as they ...The church has cracks in the ceiling.2677109430.jpg#12677109430.jpg#1r1nneutralcontradictioncontradictionneutralneutral
1entailment( ( This ( church choir ) ) ( ( ( sings ( to (...( ( The church ) ( ( is ( filled ( with song )...(ROOT (S (NP (DT This) (NN church) (NN choir))...(ROOT (S (NP (DT The) (NN church)) (VP (VBZ is...This church choir sings to the masses as they ...The church is filled with song.2677109430.jpg#12677109430.jpg#1r1eentailmententailmententailmentneutralentailment
2contradiction( ( This ( church choir ) ) ( ( ( sings ( to (...( ( ( A choir ) ( singing ( at ( a ( baseball ...(ROOT (S (NP (DT This) (NN church) (NN choir))...(ROOT (NP (NP (DT A) (NN choir)) (VP (VBG sing...This church choir sings to the masses as they ...A choir singing at a baseball game.2677109430.jpg#12677109430.jpg#1r1ccontradictioncontradictioncontradictioncontradictioncontradiction
3neutral( ( ( A woman ) ( with ( ( ( ( ( a ( green hea...( ( The woman ) ( ( is young ) . ) )(ROOT (NP (NP (DT A) (NN woman)) (PP (IN with)...(ROOT (S (NP (DT The) (NN woman)) (VP (VBZ is)...A woman with a green headscarf, blue shirt and...The woman is young.6160193920.jpg#46160193920.jpg#4r1nneutralneutralneutralneutralneutral
4entailment( ( ( A woman ) ( with ( ( ( ( ( a ( green hea...( ( The woman ) ( ( is ( very happy ) ) . ) )(ROOT (NP (NP (DT A) (NN woman)) (PP (IN with)...(ROOT (S (NP (DT The) (NN woman)) (VP (VBZ is)...A woman with a green headscarf, blue shirt and...The woman is very happy.6160193920.jpg#46160193920.jpg#4r1eentailmententailmentcontradictionentailmentneutral
\n", - "
" - ], - "text/plain": [ - " gold_label sentence1_binary_parse \\\n", - "0 neutral ( ( This ( church choir ) ) ( ( ( sings ( to (... \n", - "1 entailment ( ( This ( church choir ) ) ( ( ( sings ( to (... \n", - "2 contradiction ( ( This ( church choir ) ) ( ( ( sings ( to (... \n", - "3 neutral ( ( ( A woman ) ( with ( ( ( ( ( a ( green hea... \n", - "4 entailment ( ( ( A woman ) ( with ( ( ( ( ( a ( green hea... \n", - "\n", - " sentence2_binary_parse \\\n", - "0 ( ( The church ) ( ( has ( cracks ( in ( the c... \n", - "1 ( ( The church ) ( ( is ( filled ( with song )... \n", - "2 ( ( ( A choir ) ( singing ( at ( a ( baseball ... \n", - "3 ( ( The woman ) ( ( is young ) . ) ) \n", - "4 ( ( The woman ) ( ( is ( very happy ) ) . ) ) \n", - "\n", - " sentence1_parse \\\n", - "0 (ROOT (S (NP (DT This) (NN church) (NN choir))... \n", - "1 (ROOT (S (NP (DT This) (NN church) (NN choir))... \n", - "2 (ROOT (S (NP (DT This) (NN church) (NN choir))... \n", - "3 (ROOT (NP (NP (DT A) (NN woman)) (PP (IN with)... \n", - "4 (ROOT (NP (NP (DT A) (NN woman)) (PP (IN with)... \n", - "\n", - " sentence2_parse \\\n", - "0 (ROOT (S (NP (DT The) (NN church)) (VP (VBZ ha... \n", - "1 (ROOT (S (NP (DT The) (NN church)) (VP (VBZ is... \n", - "2 (ROOT (NP (NP (DT A) (NN choir)) (VP (VBG sing... \n", - "3 (ROOT (S (NP (DT The) (NN woman)) (VP (VBZ is)... \n", - "4 (ROOT (S (NP (DT The) (NN woman)) (VP (VBZ is)... \n", - "\n", - " sentence1 \\\n", - "0 This church choir sings to the masses as they ... \n", - "1 This church choir sings to the masses as they ... \n", - "2 This church choir sings to the masses as they ... \n", - "3 A woman with a green headscarf, blue shirt and... \n", - "4 A woman with a green headscarf, blue shirt and... \n", - "\n", - " sentence2 captionID \\\n", - "0 The church has cracks in the ceiling. 2677109430.jpg#1 \n", - "1 The church is filled with song. 2677109430.jpg#1 \n", - "2 A choir singing at a baseball game. 2677109430.jpg#1 \n", - "3 The woman is young. 
6160193920.jpg#4 \n", - "4 The woman is very happy. 6160193920.jpg#4 \n", - "\n", - " pairID label1 label2 label3 \\\n", - "0 2677109430.jpg#1r1n neutral contradiction contradiction \n", - "1 2677109430.jpg#1r1e entailment entailment entailment \n", - "2 2677109430.jpg#1r1c contradiction contradiction contradiction \n", - "3 6160193920.jpg#4r1n neutral neutral neutral \n", - "4 6160193920.jpg#4r1e entailment entailment contradiction \n", - "\n", - " label4 label5 \n", - "0 neutral neutral \n", - "1 neutral entailment \n", - "2 contradiction contradiction \n", - "3 neutral neutral \n", - "4 entailment neutral " - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "test.head()" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 02 Tokenize\n", "\n", - "We have loaded the dataset into pandas.DataFrame, we now convert sentences to tokens.\n", + "Now that we've loaded the data into a pandas.DataFrame, we can tokenize the sentences.\n", "We also clean the data before tokenizing. This includes dropping unnecessary columns and renaming the relevant columns as score, sentence1, and sentence2." 
] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def clean(df, file_split):\n", " src_file_path = os.path.join(BASE_DATA_PATH, \"raw/snli_1.0/snli_1.0_{}.txt\".format(file_split))\n", - " return snli.clean_snli(src_file_path).dropna() # drop rows with any NaN vals" + " if not os.path.exists(os.path.join(BASE_DATA_PATH, \"clean/snli_1.0\")):\n", + " os.makedirs(os.path.join(BASE_DATA_PATH, \"clean/snli_1.0\"))\n", + " dest_file_path = os.path.join(BASE_DATA_PATH, \"clean/snli_1.0/snli_1.0_{}.txt\".format(file_split))\n", + " clean_df = snli.clean_snli(src_file_path).dropna() # drop rows with any NaN vals\n", + " clean_df.to_csv(dest_file_path)\n", + " return clean_df" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "train = clean(train, 'train')\n", - "dev = clean(dev, 'dev')\n", - "test = clean(test, 'test')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Glimpse of the data" + "train = clean(train, 'train')" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -784,7 +404,7 @@ "4 There are children present " ] }, - "execution_count": 11, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -793,225 +413,34 @@ "train.head()" ] }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
scoresentence1sentence2
0neutralTwo women are embracing while holding to go pa...The sisters are hugging goodbye while holding ...
1entailmentTwo women are embracing while holding to go pa...Two woman are holding packages.
2contradictionTwo women are embracing while holding to go pa...The men are fighting outside a deli.
3entailmentTwo young children in blue jerseys, one with t...Two kids in numbered jerseys wash their hands.
4neutralTwo young children in blue jerseys, one with t...Two kids at a ballgame wash their hands.
\n", - "
" - ], - "text/plain": [ - " score sentence1 \\\n", - "0 neutral Two women are embracing while holding to go pa... \n", - "1 entailment Two women are embracing while holding to go pa... \n", - "2 contradiction Two women are embracing while holding to go pa... \n", - "3 entailment Two young children in blue jerseys, one with t... \n", - "4 neutral Two young children in blue jerseys, one with t... \n", - "\n", - " sentence2 \n", - "0 The sisters are hugging goodbye while holding ... \n", - "1 Two woman are holding packages. \n", - "2 The men are fighting outside a deli. \n", - "3 Two kids in numbered jerseys wash their hands. \n", - "4 Two kids at a ballgame wash their hands. " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dev.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
scoresentence1sentence2
0neutralThis church choir sings to the masses as they ...The church has cracks in the ceiling.
1entailmentThis church choir sings to the masses as they ...The church is filled with song.
2contradictionThis church choir sings to the masses as they ...A choir singing at a baseball game.
3neutralA woman with a green headscarf, blue shirt and...The woman is young.
4entailmentA woman with a green headscarf, blue shirt and...The woman is very happy.
\n", - "
" - ], - "text/plain": [ - " score sentence1 \\\n", - "0 neutral This church choir sings to the masses as they ... \n", - "1 entailment This church choir sings to the masses as they ... \n", - "2 contradiction This church choir sings to the masses as they ... \n", - "3 neutral A woman with a green headscarf, blue shirt and... \n", - "4 entailment A woman with a green headscarf, blue shirt and... \n", - "\n", - " sentence2 \n", - "0 The church has cracks in the ceiling. \n", - "1 The church is filled with song. \n", - "2 A choir singing at a baseball game. \n", - "3 The woman is young. \n", - "4 The woman is very happy. " - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "test.head()" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Once we have the clean pandas dataframes, we do lowercase standardization and tokenization. We use the [NLTK] (https://www.nltk.org/) library for tokenization." + "Once we have the clean pandas dataframes, we do lowercase standardization and tokenization. We use the [NLTK](https://www.nltk.org/) library for tokenization." 
] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[nltk_data] Downloading package punkt to\n", - "[nltk_data] C:\\Users\\jamahaja\\AppData\\Roaming\\nltk_data...\n", - "[nltk_data] Package punkt is already up-to-date!\n", - "[nltk_data] Downloading package punkt to\n", - "[nltk_data] C:\\Users\\jamahaja\\AppData\\Roaming\\nltk_data...\n", - "[nltk_data] Package punkt is already up-to-date!\n", - "[nltk_data] Downloading package punkt to\n", - "[nltk_data] C:\\Users\\jamahaja\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Downloading package punkt to /Users/caseyhong/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ] } ], "source": [ - "train_tok = to_nltk_tokens(to_lowercase(train))\n", - "dev_tok = to_nltk_tokens(to_lowercase(dev))\n", - "test_tok = to_nltk_tokens(to_lowercase(test))" + "train_tok = to_nltk_tokens(to_lowercase(train))" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -1046,42 +475,42 @@ " \n", " 0\n", " neutral\n", - " two women are embracing while holding to go pa...\n", - " the sisters are hugging goodbye while holding ...\n", - " [two, women, are, embracing, while, holding, t...\n", - " [the, sisters, are, hugging, goodbye, while, h...\n", + " a person on a horse jumps over a broken down a...\n", + " a person is training his horse for a competition.\n", + " [a, person, on, a, horse, jumps, over, a, brok...\n", + " [a, person, is, training, his, horse, for, a, ...\n", " \n", " \n", " 1\n", - " entailment\n", - " two women are embracing while holding to go pa...\n", - " two woman are holding packages.\n", - " [two, women, are, embracing, while, holding, t...\n", - " [two, woman, are, holding, packages, .]\n", + " contradiction\n", + " a person on a horse jumps over a broken down a...\n", + " a person is at a diner, ordering an omelette.\n", + " [a, 
person, on, a, horse, jumps, over, a, brok...\n", + " [a, person, is, at, a, diner, ,, ordering, an,...\n", " \n", " \n", " 2\n", - " contradiction\n", - " two women are embracing while holding to go pa...\n", - " the men are fighting outside a deli.\n", - " [two, women, are, embracing, while, holding, t...\n", - " [the, men, are, fighting, outside, a, deli, .]\n", + " entailment\n", + " a person on a horse jumps over a broken down a...\n", + " a person is outdoors, on a horse.\n", + " [a, person, on, a, horse, jumps, over, a, brok...\n", + " [a, person, is, outdoors, ,, on, a, horse, .]\n", " \n", " \n", " 3\n", - " entailment\n", - " two young children in blue jerseys, one with t...\n", - " two kids in numbered jerseys wash their hands.\n", - " [two, young, children, in, blue, jerseys, ,, o...\n", - " [two, kids, in, numbered, jerseys, wash, their...\n", + " neutral\n", + " children smiling and waving at camera\n", + " they are smiling at their parents\n", + " [children, smiling, and, waving, at, camera]\n", + " [they, are, smiling, at, their, parents]\n", " \n", " \n", " 4\n", - " neutral\n", - " two young children in blue jerseys, one with t...\n", - " two kids at a ballgame wash their hands.\n", - " [two, young, children, in, blue, jerseys, ,, o...\n", - " [two, kids, at, a, ballgame, wash, their, hand...\n", + " entailment\n", + " children smiling and waving at camera\n", + " there are children present\n", + " [children, smiling, and, waving, at, camera]\n", + " [there, are, children, present]\n", " \n", " \n", "\n", @@ -1089,41 +518,41 @@ ], "text/plain": [ " score sentence1 \\\n", - "0 neutral two women are embracing while holding to go pa... \n", - "1 entailment two women are embracing while holding to go pa... \n", - "2 contradiction two women are embracing while holding to go pa... \n", - "3 entailment two young children in blue jerseys, one with t... \n", - "4 neutral two young children in blue jerseys, one with t... 
\n", + "0 neutral a person on a horse jumps over a broken down a... \n", + "1 contradiction a person on a horse jumps over a broken down a... \n", + "2 entailment a person on a horse jumps over a broken down a... \n", + "3 neutral children smiling and waving at camera \n", + "4 entailment children smiling and waving at camera \n", "\n", " sentence2 \\\n", - "0 the sisters are hugging goodbye while holding ... \n", - "1 two woman are holding packages. \n", - "2 the men are fighting outside a deli. \n", - "3 two kids in numbered jerseys wash their hands. \n", - "4 two kids at a ballgame wash their hands. \n", + "0 a person is training his horse for a competition. \n", + "1 a person is at a diner, ordering an omelette. \n", + "2 a person is outdoors, on a horse. \n", + "3 they are smiling at their parents \n", + "4 there are children present \n", "\n", " sentence1_tokens \\\n", - "0 [two, women, are, embracing, while, holding, t... \n", - "1 [two, women, are, embracing, while, holding, t... \n", - "2 [two, women, are, embracing, while, holding, t... \n", - "3 [two, young, children, in, blue, jerseys, ,, o... \n", - "4 [two, young, children, in, blue, jerseys, ,, o... \n", + "0 [a, person, on, a, horse, jumps, over, a, brok... \n", + "1 [a, person, on, a, horse, jumps, over, a, brok... \n", + "2 [a, person, on, a, horse, jumps, over, a, brok... \n", + "3 [children, smiling, and, waving, at, camera] \n", + "4 [children, smiling, and, waving, at, camera] \n", "\n", " sentence2_tokens \n", - "0 [the, sisters, are, hugging, goodbye, while, h... \n", - "1 [two, woman, are, holding, packages, .] \n", - "2 [the, men, are, fighting, outside, a, deli, .] \n", - "3 [two, kids, in, numbered, jerseys, wash, their... \n", - "4 [two, kids, at, a, ballgame, wash, their, hand... " + "0 [a, person, is, training, his, horse, for, a, ... \n", + "1 [a, person, is, at, a, diner, ,, ordering, an,... \n", + "2 [a, person, is, outdoors, ,, on, a, horse, .] 
\n", + "3 [they, are, smiling, at, their, parents] \n", + "4 [there, are, children, present] " ] }, - "execution_count": 15, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "dev_tok.head()" + "train_tok.head()" ] }, { @@ -1138,10 +567,35 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to /Users/caseyhong/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n", + "[nltk_data] Downloading package punkt to /Users/caseyhong/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n", + "[nltk_data] Downloading package punkt to /Users/caseyhong/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n" + ] + } + ], "source": [ + "train = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")\n", + "dev = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"dev\")\n", + "test = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"test\")\n", + "\n", + "clean_train = clean(train, file_split=\"train\")\n", + "clean_dev = clean(dev, file_split=\"dev\")\n", + "clean_test = clean(test, file_split=\"test\")\n", + "\n", + "train_tok = to_nltk_tokens(to_lowercase(clean_train))\n", + "dev_tok = to_nltk_tokens(to_lowercase(clean_dev))\n", + "test_tok = to_nltk_tokens(to_lowercase(clean_test))\n", + "\n", "split_map = {'train': train_tok, 'dev': dev_tok, 'test': test_tok}\n", "for file_split, df in split_map.items():\n", " base_txt_path = os.path.join(BASE_DATA_PATH, \"clean/snli_1.0/snli_1.0_{}.txt\".format(file_split))\n", @@ -1155,7 +609,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -1176,13 +630,6 @@ " shutil.move(\"{}.tmp\".format(s1_tok_path), s1_tok_path)\n", " shutil.move(\"{}.tmp\".format(s2_tok_path), s2_tok_path)" ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {