From 217a01f1b77638abd2a5ca9476ea44c216338fa7 Mon Sep 17 00:00:00 2001 From: liesenf Date: Fri, 7 Apr 2023 16:47:28 +0200 Subject: [PATCH 01/12] added test data --- data/ulwa json corpus/conversations.json | 1 + data/ulwa json corpus/corpus.json | 1 + data/ulwa json corpus/index.json | 1 + data/ulwa json corpus/speakers.json | 1 + data/ulwa json corpus/utterances.jsonl | 10 ++++++++++ data/ulwa_testdata_convokit_format.csv | 11 +++++++++++ data/ulwa_testdata_sktalk_format.csv | 11 +++++++++++ data/vamale json corpus/conversations.json | 1 + data/vamale json corpus/corpus.json | 1 + data/vamale json corpus/index.json | 1 + data/vamale json corpus/speakers.json | 1 + data/vamale json corpus/utterances.jsonl | 10 ++++++++++ data/vamale_testdata_convokit_format.csv | 11 +++++++++++ data/vamale_testdata_sktalk_format.csv | 11 +++++++++++ 14 files changed, 72 insertions(+) create mode 100644 data/ulwa json corpus/conversations.json create mode 100644 data/ulwa json corpus/corpus.json create mode 100644 data/ulwa json corpus/index.json create mode 100644 data/ulwa json corpus/speakers.json create mode 100644 data/ulwa json corpus/utterances.jsonl create mode 100644 data/ulwa_testdata_convokit_format.csv create mode 100644 data/ulwa_testdata_sktalk_format.csv create mode 100644 data/vamale json corpus/conversations.json create mode 100644 data/vamale json corpus/corpus.json create mode 100644 data/vamale json corpus/index.json create mode 100644 data/vamale json corpus/speakers.json create mode 100644 data/vamale json corpus/utterances.jsonl create mode 100644 data/vamale_testdata_convokit_format.csv create mode 100644 data/vamale_testdata_sktalk_format.csv diff --git a/data/ulwa json corpus/conversations.json b/data/ulwa json corpus/conversations.json new file mode 100644 index 0000000..d038096 --- /dev/null +++ b/data/ulwa json corpus/conversations.json @@ -0,0 +1 @@ +{"/ulwa1/ulwa014": {"meta": {}, "vectors": []}} \ No newline at end of file diff --git a/data/ulwa json 
corpus/corpus.json b/data/ulwa json corpus/corpus.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/data/ulwa json corpus/corpus.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/data/ulwa json corpus/index.json b/data/ulwa json corpus/index.json new file mode 100644 index 0000000..5aed485 --- /dev/null +++ b/data/ulwa json corpus/index.json @@ -0,0 +1 @@ +{"utterances-index": {}, "speakers-index": {}, "conversations-index": {}, "overall-index": {}, "version": 1, "vectors": []} \ No newline at end of file diff --git a/data/ulwa json corpus/speakers.json b/data/ulwa json corpus/speakers.json new file mode 100644 index 0000000..468ad1e --- /dev/null +++ b/data/ulwa json corpus/speakers.json @@ -0,0 +1 @@ +{"Tang": {"meta": {}, "vectors": []}, "Yan": {"meta": {}, "vectors": []}} \ No newline at end of file diff --git a/data/ulwa json corpus/utterances.jsonl b/data/ulwa json corpus/utterances.jsonl new file mode 100644 index 0000000..52e8599 --- /dev/null +++ b/data/ulwa json corpus/utterances.jsonl @@ -0,0 +1,10 @@ +{"id": "0", "conversation_id": "/ulwa1/ulwa014", "text": "U oughs inim t\u00ef samting yan", "speaker": "Tang", "meta": {}, "reply-to": null, "timestamp": 1332718704, "vectors": []} +{"id": "1", "conversation_id": "/ulwa1/ulwa014", "text": "mbam ndul ma wandam ana", "speaker": "Yan", "meta": {}, "reply-to": null, "timestamp": 1332732704, "vectors": []} +{"id": "2", "conversation_id": "/ulwa1/ulwa014", "text": "M\u00ef inim wandam bai anapa nd", "speaker": "Tang", "meta": {}, "reply-to": null, "timestamp": 1332743704, "vectors": []} +{"id": "3", "conversation_id": "/ulwa1/ulwa014", "text": "lunda we nd\u00efm\u00efne in", "speaker": "Yan", "meta": {}, "reply-to": null, "timestamp": 1332754704, "vectors": []} +{"id": "4", "conversation_id": "/ulwa1/ulwa014", "text": "k\u00efnakape ak\u00efnaka", "speaker": "Tang", "meta": {}, "reply-to": null, "timestamp": 1332765704, "vectors": []} +{"id": "5", "conversation_id": 
"/ulwa1/ulwa014", "text": "coughs nd\u00efm\u00efne we ndul wa le we nd\u00eft\u00ef ak\u00efnakape malimap mat\u00ef yawa mananda", "speaker": "Yan", "meta": {}, "reply-to": null, "timestamp": 1332776704, "vectors": []} +{"id": "6", "conversation_id": "/ulwa1/ulwa014", "text": "mananda", "speaker": "Tang", "meta": {}, "reply-to": null, "timestamp": 1332787704, "vectors": []} +{"id": "7", "conversation_id": "/ulwa1/ulwa014", "text": "da", "speaker": "Yan", "meta": {}, "reply-to": null, "timestamp": 1332788704, "vectors": []} +{"id": "8", "conversation_id": "/ulwa1/ulwa014", "text": "e k\u00efkal awi ak\u00efnakape", "speaker": "Tang", "meta": {}, "reply-to": null, "timestamp": 1332789704, "vectors": []} +{"id": "9", "conversation_id": "/ulwa1/ulwa014", "text": "at\u00efm inim.", "speaker": "Yan", "meta": {}, "reply-to": null, "timestamp": 1332999704, "vectors": []} diff --git a/data/ulwa_testdata_convokit_format.csv b/data/ulwa_testdata_convokit_format.csv new file mode 100644 index 0000000..4e8e5c0 --- /dev/null +++ b/data/ulwa_testdata_convokit_format.csv @@ -0,0 +1,11 @@ +timestamp,speaker,text,translation,conversation_id,utterance_raw,reply_to +1332718704,Tang,U oughs inim tï samting yan,"Lorem ipsum dolor sit amet.",/ulwa1/ulwa014,U oughs inim tï samting yangama ul matï akïnakape,None +1332732704,Yan,mbam ndul ma wandam ana,"At neque fugit eum reprehenderit labore et exercitationem voluptatem. 
eos odio aspernatur.",/ulwa1/ulwa014,wimbam ndul ma wandam anapa ol welunda nïkap tu mananda yangama,None +1332743704,Tang,Mï inim wandam bai anapa nd,"a veritatis tempore sit vitae quaerat sed consequatur amet qui nisi facilis et perferendis nisi ut maiores consequatur.",/ulwa1/ulwa014,Mï inim wandam bai anapa ndïtï ka welunda unan,None +1332754704,Yan,lunda we ndïmïne in,,/ulwa1/ulwa014,ata welunda we ndïmïne ind,None +1332765704,Tang,kïnakape akïnaka,,/ulwa1/ulwa014,i akïnakape akïnakap,None +1332776704,Yan,coughs ndïmïne we ndul wa le we ndïtï akïnakape malimap matï yawa mananda,"Et illo facere vel magni necessitatibus est aspernatur numquam",/ulwa1/ulwa014,[coughs] I inim oughs ka lopop mananda bai kïkal yangama we ini,None +1332787704,Tang,mananda,,/ulwa1/ulwa014,n mananda ndïtï ka akïnakape wimbam,None +1332788704,Yan,da,,/ulwa1/ulwa014,da ndïtï ka,None +1332789704,Tang,e kïkal awi akïnakape,"onsequatur amet qui nisi facilis et perferendis nisi ut",/ulwa1/ulwa014,e kïkal awi akïnakape manï lï,None +1332999704,Yan,atïm inim.,"itae quaerat sed consequatur amet",/ulwa1/ulwa014,atïm inim.,None diff --git a/data/ulwa_testdata_sktalk_format.csv b/data/ulwa_testdata_sktalk_format.csv new file mode 100644 index 0000000..a8116f6 --- /dev/null +++ b/data/ulwa_testdata_sktalk_format.csv @@ -0,0 +1,11 @@ +begin,end,participant,utterance,translation,source,utterance_raw +00:00:00.917,00:00:05.604,Tang,U oughs inim tï samting yan,"Lorem ipsum dolor sit amet.",/ulwa1/ulwa014,U oughs inim tï samting yangama ul matï akïnakape +00:00:04.830,00:00:09.080,Yan,mbam ndul ma wandam ana,"At neque fugit eum reprehenderit labore et exercitationem voluptatem. 
eos odio aspernatur.",/ulwa1/ulwa014,wimbam ndul ma wandam anapa ol welunda nïkap tu mananda yangama +00:00:06.090,00:00:09.450,Tang,Mï inim wandam bai anapa nd,"a veritatis tempore sit vitae quaerat sed consequatur amet qui nisi facilis et perferendis nisi ut maiores consequatur.",/ulwa1/ulwa014,Mï inim wandam bai anapa ndïtï ka welunda unan +00:00:09.534,00:00:10.333,Yan,lunda we ndïmïne in,,/ulwa1/ulwa014,ata welunda we ndïmïne ind +00:00:10.333,00:00:11.143,Tang,kïnakape akïnaka,,/ulwa1/ulwa014,i akïnakape akïnakap +00:00:11.143,00:00:18.240,Yan,coughs ndïmïne we ndul wa le we ndïtï akïnakape malimap matï yawa mananda,"Et illo facere vel magni necessitatibus est aspernatur numquam",/ulwa1/ulwa014,[coughs] I inim oughs ka lopop mananda bai kïkal yangama we ini +00:00:11.477,00:00:12.205,Tang,mananda,,/ulwa1/ulwa014,n mananda ndïtï ka akïnakape wimbam +00:00:14.390,00:00:15.696,Yan,da,,/ulwa1/ulwa014,da ndïtï ka +00:00:17.972,00:00:20.722,Tang,e kïkal awi akïnakape,"onsequatur amet qui nisi facilis et perferendis nisi ut",/ulwa1/ulwa014,e kïkal awi akïnakape manï lï +00:00:18.240,00:00:21.970,Yan,atïm inim.,"itae quaerat sed consequatur amet",/ulwa1/ulwa014,atïm inim. 
diff --git a/data/vamale json corpus/conversations.json b/data/vamale json corpus/conversations.json new file mode 100644 index 0000000..317e1b0 --- /dev/null +++ b/data/vamale json corpus/conversations.json @@ -0,0 +1 @@ +{"/vamale1/vamaleE": {"meta": {}, "vectors": []}, "/vamale1/vamaleie": {"meta": {}, "vectors": []}, "/vamale1/vamale-vie": {"meta": {}, "vectors": []}, "/vamale1/vamale-MS": {"meta": {}, "vectors": []}, "/vamale1/vamaleMS": {"meta": {}, "vectors": []}, "/vamale1/vamalen-MS": {"meta": {}, "vectors": []}, "/vamale1/vamale4": {"meta": {}, "vectors": []}} \ No newline at end of file diff --git a/data/vamale json corpus/corpus.json b/data/vamale json corpus/corpus.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/data/vamale json corpus/corpus.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/data/vamale json corpus/index.json b/data/vamale json corpus/index.json new file mode 100644 index 0000000..5aed485 --- /dev/null +++ b/data/vamale json corpus/index.json @@ -0,0 +1 @@ +{"utterances-index": {}, "speakers-index": {}, "conversations-index": {}, "overall-index": {}, "version": 1, "vectors": []} \ No newline at end of file diff --git a/data/vamale json corpus/speakers.json b/data/vamale json corpus/speakers.json new file mode 100644 index 0000000..1fe10f8 --- /dev/null +++ b/data/vamale json corpus/speakers.json @@ -0,0 +1 @@ +{"Ric": {"meta": {}, "vectors": []}, "Rie": {"meta": {}, "vectors": []}} \ No newline at end of file diff --git a/data/vamale json corpus/utterances.jsonl b/data/vamale json corpus/utterances.jsonl new file mode 100644 index 0000000..c93a26e --- /dev/null +++ b/data/vamale json corpus/utterances.jsonl @@ -0,0 +1,10 @@ +{"id": "0", "conversation_id": "/vamale1/vamaleE", "text": "Aoo angovi \u0263ase aoo vinjo thamauke va nyau yanle?", "speaker": "Ric", "meta": {}, "reply-to": null, "timestamp": 1332712704, "vectors": []} +{"id": "1", "conversation_id": "/vamale1/vamaleie", "text": "ta\u0263ua va 
thamauke \u0272akoe va angovi \u0263ase.", "speaker": "Rie", "meta": {}, "reply-to": null, "timestamp": 1332722704, "vectors": []} +{"id": "2", "conversation_id": "/vamale1/vamale-vie", "text": "Va vinjo ta\u0263ua daa thanga \u0272akoe daa \u0263ase angovi", "speaker": "Ric", "meta": {}, "reply-to": null, "timestamp": 1332732704, "vectors": []} +{"id": "3", "conversation_id": "/vamale1/vamale-MS", "text": "a nyau thapoke va \u0272akoe", "speaker": "Rie", "meta": {}, "reply-to": null, "timestamp": 1332742704, "vectors": []} +{"id": "4", "conversation_id": "/vamale1/vamaleMS", "text": "\u0272akoe va \u0263ananda konaa va \u0263ananda vinjo ", "speaker": "Ric", "meta": {}, "reply-to": null, "timestamp": 1332755704, "vectors": []} +{"id": "5", "conversation_id": "/vamale1/vamaleMS", "text": "o \u0263ase va angovi angovi nea", "speaker": "Rie", "meta": {}, "reply-to": null, "timestamp": 1332766704, "vectors": []} +{"id": "6", "conversation_id": "/vamale1/vamalen-MS", "text": "au nyau aoo thamauke \u0263ase daa ya", "speaker": "Ric", "meta": {}, "reply-to": null, "timestamp": 1332777704, "vectors": []} +{"id": "7", "conversation_id": "/vamale1/vamaleMS", "text": "uke! 
Daa \u0263ananda thamauke va \u0263ase t", "speaker": "Rie", "meta": {}, "reply-to": null, "timestamp": 1332788804, "vectors": []} +{"id": "8", "conversation_id": "/vamale1/vamale4", "text": "yanle konaa daa thamauke \u0263ananda va", "speaker": "Ric", "meta": {}, "reply-to": null, "timestamp": 1332799904, "vectors": []} +{"id": "9", "conversation_id": "/vamale1/vamale4", "text": "ga nyau va vinjo konaa daa \u0263anand", "speaker": "Rie", "meta": {}, "reply-to": null, "timestamp": 1332792704, "vectors": []} diff --git a/data/vamale_testdata_convokit_format.csv b/data/vamale_testdata_convokit_format.csv new file mode 100644 index 0000000..299f509 --- /dev/null +++ b/data/vamale_testdata_convokit_format.csv @@ -0,0 +1,11 @@ +timestamp,speaker,text,conversation_id,utterance_raw,reply_to +1332712704,Ric,Aoo angovi ɣase aoo vinjo thamauke va nyau yanle?,/vamale1/vamaleE,Aoo angovi ɣase aoo vinjo thamauke va nyau yanle?,None +1332722704,Rie,taɣua va thamauke ɲakoe va angovi ɣase.,/vamale1/vamaleie,ua va thamauke ɲakoe v,None +1332732704,Ric,Va vinjo taɣua daa thanga ɲakoe daa ɣase angovi,/vamale1/vamale-vie,aa thanga ɲakoe daa ɣase,None +1332742704,Rie,a nyau thapoke va ɲakoe,/vamale1/vamale-MS,thapoke va ɲak,None +1332755704,Ric,ɲakoe va ɣananda konaa va ɣananda vinjo ,/vamale1/vamaleMS,ɣnaa va ɣananda vin,None +1332766704,Rie,o ɣase va angovi angovi nea,/vamale1/vamaleMS,va angovi angovi,None +1332777704,Ric,au nyau aoo thamauke ɣase daa ya,/vamale1/vamalen-MS,limyau aoo thamauke ɣase daaa,None +1332788804,Rie,uke! Daa ɣananda thamauke va ɣase t,/vamale1/vamaleMS,ke! 
Daa ɣananda tham,None +1332799904,Ric,yanle konaa daa thamauke ɣananda va,/vamale1/vamale4,konaa daa,None +1332792704,Rie,ga nyau va vinjo konaa daa ɣanand,/vamale1/vamale4,au va vinjo konaa daa ɣa,None diff --git a/data/vamale_testdata_sktalk_format.csv b/data/vamale_testdata_sktalk_format.csv new file mode 100644 index 0000000..becde13 --- /dev/null +++ b/data/vamale_testdata_sktalk_format.csv @@ -0,0 +1,11 @@ +begin,end,participant,utterance,translation,source,utterance_raw +00:01:33.740,00:01:36.200,Ric,Aoo angovi ɣase aoo vinjo thamauke va nyau yanle?,,/vamale1/vamaleE,Aoo angovi ɣase aoo vinjo thamauke va nyau yanle? +00:02:25.065,00:02:26.445,Rie,taɣua va thamauke ɲakoe va angovi ɣase.,,/vamale1/vamaleie,ua va thamauke ɲakoe v +00:02:26.385,00:02:27.925,Ric,Va vinjo taɣua daa thanga ɲakoe daa ɣase angovi,,/vamale1/vamale-vie,aa thanga ɲakoe daa ɣase +00:03:03.530,00:03:05.130,Rie,a nyau thapoke va ɲakoe,,/vamale1/vamale-MS,thapoke va ɲak +00:03:05.595,00:03:08.355,Ric,ɲakoe va ɣananda konaa va ɣananda vinjo ,,/vamale1/vamaleMS,ɣnaa va ɣananda vin +00:03:08.710,00:03:12.720,Rie,o ɣase va angovi angovi nea,,/vamale1/vamaleMS,va angovi angovi +00:03:12.950,00:03:16.760,Ric,au nyau aoo thamauke ɣase daa ya,,/vamale1/vamalen-MS,limyau aoo thamauke ɣase daaa +00:03:17.070,00:03:19.470,Rie,uke! Daa ɣananda thamauke va ɣase t,,/vamale1/vamaleMS,ke! 
Daa ɣananda tham +00:15:44.815,00:15:46.395,Ric,yanle konaa daa thamauke ɣananda va,,/vamale1/vamale4,konaa daa +00:26:03.605,00:26:05.345,Rie,ga nyau va vinjo konaa daa ɣanand,,/vamale1/vamale4,au va vinjo konaa daa ɣa From 8669e01eccf8408c81f988b921ca2eeb34d779db Mon Sep 17 00:00:00 2001 From: liesenf Date: Fri, 7 Apr 2023 17:08:25 +0200 Subject: [PATCH 02/12] added first class --- notebooks/exploration.ipynb | 235 ++++++++++++++++++++++++++++++++---- sktalk/csv_to_json.py | 13 ++ 2 files changed, 227 insertions(+), 21 deletions(-) create mode 100644 sktalk/csv_to_json.py diff --git a/notebooks/exploration.ipynb b/notebooks/exploration.ipynb index 5949ffa..d4db4bf 100644 --- a/notebooks/exploration.ipynb +++ b/notebooks/exploration.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 27, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -25,58 +25,251 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import sktalk.csv_to_json as cj\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The `Demo` module\n", + "\n", + "Here, we import a single class from the `demo` module, and use it." + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
beginendparticipantutterancetranslationsourceutterance_raw
000:00:00.91700:00:05.604TangU oughs inim tï samting yanLorem ipsum dolor sit amet./ulwa1/ulwa014U oughs inim tï samting yangama ul matï akïnakape
100:00:04.83000:00:09.080Yanmbam ndul ma wandam anaAt neque fugit eum reprehenderit labore et exe.../ulwa1/ulwa014wimbam ndul ma wandam anapa ol welunda nïkap t...
200:00:06.09000:00:09.450TangMï inim wandam bai anapa nda veritatis tempore sit vitae quaerat sed cons.../ulwa1/ulwa014Mï inim wandam bai anapa ndïtï ka welunda unan
300:00:09.53400:00:10.333Yanlunda we ndïmïne inNaN/ulwa1/ulwa014ata welunda we ndïmïne ind
400:00:10.33300:00:11.143Tangkïnakape akïnakaNaN/ulwa1/ulwa014i akïnakape akïnakap
500:00:11.14300:00:18.240Yancoughs ndïmïne we ndul wa le we ndïtï akïnakap...Et illo facere vel magni necessitatibus est as.../ulwa1/ulwa014[coughs] I inim oughs ka lopop mananda bai kïk...
600:00:11.47700:00:12.205TangmanandaNaN/ulwa1/ulwa014n mananda ndïtï ka akïnakape wimbam
700:00:14.39000:00:15.696YandaNaN/ulwa1/ulwa014da ndïtï ka
800:00:17.97200:00:20.722Tange kïkal awi akïnakapeonsequatur amet qui nisi facilis et perferendi.../ulwa1/ulwa014e kïkal awi akïnakape manï lï
900:00:18.24000:00:21.970Yanatïm inim.itae quaerat sed consequatur amet/ulwa1/ulwa014atïm inim.
\n", + "
" + ], "text/plain": [ - "'Hello Person!'" + " begin end participant \\\n", + "0 00:00:00.917 00:00:05.604 Tang \n", + "1 00:00:04.830 00:00:09.080 Yan \n", + "2 00:00:06.090 00:00:09.450 Tang \n", + "3 00:00:09.534 00:00:10.333 Yan \n", + "4 00:00:10.333 00:00:11.143 Tang \n", + "5 00:00:11.143 00:00:18.240 Yan \n", + "6 00:00:11.477 00:00:12.205 Tang \n", + "7 00:00:14.390 00:00:15.696 Yan \n", + "8 00:00:17.972 00:00:20.722 Tang \n", + "9 00:00:18.240 00:00:21.970 Yan \n", + "\n", + " utterance \\\n", + "0 U oughs inim tï samting yan \n", + "1 mbam ndul ma wandam ana \n", + "2 Mï inim wandam bai anapa nd \n", + "3 lunda we ndïmïne in \n", + "4 kïnakape akïnaka \n", + "5 coughs ndïmïne we ndul wa le we ndïtï akïnakap... \n", + "6 mananda \n", + "7 da \n", + "8 e kïkal awi akïnakape \n", + "9 atïm inim. \n", + "\n", + " translation source \\\n", + "0 Lorem ipsum dolor sit amet. /ulwa1/ulwa014 \n", + "1 At neque fugit eum reprehenderit labore et exe... /ulwa1/ulwa014 \n", + "2 a veritatis tempore sit vitae quaerat sed cons... /ulwa1/ulwa014 \n", + "3 NaN /ulwa1/ulwa014 \n", + "4 NaN /ulwa1/ulwa014 \n", + "5 Et illo facere vel magni necessitatibus est as... /ulwa1/ulwa014 \n", + "6 NaN /ulwa1/ulwa014 \n", + "7 NaN /ulwa1/ulwa014 \n", + "8 onsequatur amet qui nisi facilis et perferendi... /ulwa1/ulwa014 \n", + "9 itae quaerat sed consequatur amet /ulwa1/ulwa014 \n", + "\n", + " utterance_raw \n", + "0 U oughs inim tï samting yangama ul matï akïnakape \n", + "1 wimbam ndul ma wandam anapa ol welunda nïkap t... \n", + "2 Mï inim wandam bai anapa ndïtï ka welunda unan \n", + "3 ata welunda we ndïmïne ind \n", + "4 i akïnakape akïnakap \n", + "5 [coughs] I inim oughs ka lopop mananda bai kïk... \n", + "6 n mananda ndïtï ka akïnakape wimbam \n", + "7 da ndïtï ka \n", + "8 e kïkal awi akïnakape manï lï \n", + "9 atïm inim. 
" ] }, - "execution_count": 47, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import sktalk.my_module as mod\n", - "\n", - "mod.hello(\"Person\")\n" + "corpus = cj.Corpus(\"../data/ulwa_testdata_sktalk_format.csv\")\n", + "corpus.return_dataframe()\n", + "corpus.df" ] }, { - "attachments": {}, - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 5, "metadata": {}, + "outputs": [], "source": [ - "## The `Demo` module\n", - "\n", - "Here, we import a single class from the `demo` module, and use it." + "corpus.return_json()" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'Person'" + "'{\"begin\":{\"0\":\"00:00:00.917\",\"1\":\"00:00:04.830\",\"2\":\"00:00:06.090\",\"3\":\"00:00:09.534\",\"4\":\"00:00:10.333\",\"5\":\"00:00:11.143\",\"6\":\"00:00:11.477\",\"7\":\"00:00:14.390\",\"8\":\"00:00:17.972\",\"9\":\"00:00:18.240\"},\"end\":{\"0\":\"00:00:05.604\",\"1\":\"00:00:09.080\",\"2\":\"00:00:09.450\",\"3\":\"00:00:10.333\",\"4\":\"00:00:11.143\",\"5\":\"00:00:18.240\",\"6\":\"00:00:12.205\",\"7\":\"00:00:15.696\",\"8\":\"00:00:20.722\",\"9\":\"00:00:21.970\"},\"participant\":{\"0\":\"Tang\",\"1\":\"Yan\",\"2\":\"Tang\",\"3\":\"Yan\",\"4\":\"Tang\",\"5\":\"Yan\",\"6\":\"Tang\",\"7\":\"Yan\",\"8\":\"Tang\",\"9\":\"Yan\"},\"utterance\":{\"0\":\"U oughs inim t\\\\u00ef samting yan\",\"1\":\"mbam ndul ma wandam ana\",\"2\":\"M\\\\u00ef inim wandam bai anapa nd\",\"3\":\"lunda we nd\\\\u00efm\\\\u00efne in\",\"4\":\"k\\\\u00efnakape ak\\\\u00efnaka\",\"5\":\"coughs nd\\\\u00efm\\\\u00efne we ndul wa le we nd\\\\u00eft\\\\u00ef ak\\\\u00efnakape malimap mat\\\\u00ef yawa mananda\",\"6\":\"mananda\",\"7\":\"da\",\"8\":\"e k\\\\u00efkal awi ak\\\\u00efnakape\",\"9\":\"at\\\\u00efm inim.\"},\"translation\":{\"0\":\"Lorem ipsum dolor sit amet.\",\"1\":\"At neque fugit eum reprehenderit labore et exercitationem voluptatem. 
eos odio aspernatur.\",\"2\":\"a veritatis tempore sit vitae quaerat sed consequatur amet qui nisi facilis et perferendis nisi ut maiores consequatur.\",\"3\":null,\"4\":null,\"5\":\"Et illo facere vel magni necessitatibus est aspernatur numquam\",\"6\":null,\"7\":null,\"8\":\"onsequatur amet qui nisi facilis et perferendis nisi ut\",\"9\":\"itae quaerat sed consequatur amet\"},\"source\":{\"0\":\"\\\\/ulwa1\\\\/ulwa014\",\"1\":\"\\\\/ulwa1\\\\/ulwa014\",\"2\":\"\\\\/ulwa1\\\\/ulwa014\",\"3\":\"\\\\/ulwa1\\\\/ulwa014\",\"4\":\"\\\\/ulwa1\\\\/ulwa014\",\"5\":\"\\\\/ulwa1\\\\/ulwa014\",\"6\":\"\\\\/ulwa1\\\\/ulwa014\",\"7\":\"\\\\/ulwa1\\\\/ulwa014\",\"8\":\"\\\\/ulwa1\\\\/ulwa014\",\"9\":\"\\\\/ulwa1\\\\/ulwa014\"},\"utterance_raw\":{\"0\":\"U oughs inim t\\\\u00ef samting yangama ul mat\\\\u00ef ak\\\\u00efnakape\",\"1\":\"wimbam ndul ma wandam anapa ol welunda n\\\\u00efkap tu mananda yangama \",\"2\":\"M\\\\u00ef inim wandam bai anapa nd\\\\u00eft\\\\u00ef ka welunda unan\",\"3\":\"ata welunda we nd\\\\u00efm\\\\u00efne ind\",\"4\":\"i ak\\\\u00efnakape ak\\\\u00efnakap\",\"5\":\"[coughs] I inim oughs ka lopop mananda bai k\\\\u00efkal yangama we ini\",\"6\":\"n mananda nd\\\\u00eft\\\\u00ef ka ak\\\\u00efnakape wimbam\",\"7\":\"da nd\\\\u00eft\\\\u00ef ka\",\"8\":\"e k\\\\u00efkal awi ak\\\\u00efnakape man\\\\u00ef l\\\\u00ef\",\"9\":\"at\\\\u00efm inim.\"}}'" ] }, - "execution_count": 48, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from sktalk.demo import Demo\n", - "\n", - "person = Demo(\"Person\")\n", - "\n", - "person.name" + "corpus.json" ] } ], @@ -96,7 +289,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.9.10" }, "orig_nbformat": 4, "vscode": { diff --git a/sktalk/csv_to_json.py b/sktalk/csv_to_json.py new file mode 100644 index 0000000..84e66c1 --- /dev/null +++ b/sktalk/csv_to_json.py @@ -0,0 +1,13 @@ +"""Documentation about the 
scikit-talk module - csv to json.""" +import pandas as pd + +class Corpus: + def __init__(self,path): + self.path = path + + def return_dataframe(self): + self.df = pd.read_csv(self.path) + + def return_json(self): + self.return_dataframe() + self.json = self.df.to_json() \ No newline at end of file From 2975c672027592e175132e3e68a11cb6abd3fb58 Mon Sep 17 00:00:00 2001 From: liesenf Date: Fri, 7 Apr 2023 17:17:30 +0200 Subject: [PATCH 03/12] added pandas to package requirements --- setup.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 4c5a3a3..2af455f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,7 +35,8 @@ zip_safe = False python_requires = >=3.7 include_package_data = True packages = find: -install_requires = +install_requires = + pandas [options.data_files] # This section requires setuptools>=40.6.0 From 0c31614ef09f3eab371f553be638010eab5b9ff5 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 7 Apr 2023 19:36:45 +0200 Subject: [PATCH 04/12] fix linter issues --- sktalk/csv_to_json.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sktalk/csv_to_json.py b/sktalk/csv_to_json.py index 84e66c1..7abaa56 100644 --- a/sktalk/csv_to_json.py +++ b/sktalk/csv_to_json.py @@ -1,13 +1,14 @@ """Documentation about the scikit-talk module - csv to json.""" import pandas as pd + class Corpus: - def __init__(self,path): + def __init__(self, path): self.path = path - + def return_dataframe(self): self.df = pd.read_csv(self.path) def return_json(self): self.return_dataframe() - self.json = self.df.to_json() \ No newline at end of file + self.json = self.df.to_json() From bda6ed0d81f0de835fd6926b97622b9b7acb5ab2 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 7 Apr 2023 20:05:17 +0200 Subject: [PATCH 05/12] remove link checker config --- .mlc-config.json | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 .mlc-config.json diff --git a/.mlc-config.json 
b/.mlc-config.json deleted file mode 100644 index 1d38867..0000000 --- a/.mlc-config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "_comment": "Markdown Link Checker configuration, see https://github.com/gaurav-nelson/github-action-markdown-link-check and https://github.com/tcort/markdown-link-check", - "ignorePatterns": [ - { - "pattern": "^http://localhost" - }, - { - "pattern": "^https://doi.org/" - }, - { - "pattern": "^https://github.com/.*/settings/secrets/actions$" - }, - { - "pattern": "^https://github.com/organizations/.*/repositories/new" - }, - { - "pattern": "^https://test.pypi.org" - }, - { - "pattern": "^https://bestpractices.coreinfrastructure.org/projects/" - }, - { - "pattern": "^https://readthedocs.org/dashboard/import.*" - } - ], - "replacementPatterns": [ - ], - "retryOn429": true, - "timeout": "20s" -} From d827c7e6d4ead175792403b1ca7c510988da81bd Mon Sep 17 00:00:00 2001 From: liesenf Date: Wed, 12 Apr 2023 13:15:06 +0200 Subject: [PATCH 06/12] added "from_convokit" function that showcases desired functionality to parse and write corpora --- data/testcorpus/conversations.json | 1 + data/testcorpus/corpus.json | 1 + data/testcorpus/index.json | 1 + data/testcorpus/speakers.json | 1 + data/testcorpus/utterances.jsonl | 10 + notebooks/exploration.ipynb | 14 +- sktalk/from_convokit.py | 2425 ++++++++++++++++++++++++++++ 7 files changed, 2441 insertions(+), 12 deletions(-) create mode 100644 data/testcorpus/conversations.json create mode 100644 data/testcorpus/corpus.json create mode 100644 data/testcorpus/index.json create mode 100644 data/testcorpus/speakers.json create mode 100644 data/testcorpus/utterances.jsonl create mode 100644 sktalk/from_convokit.py diff --git a/data/testcorpus/conversations.json b/data/testcorpus/conversations.json new file mode 100644 index 0000000..d038096 --- /dev/null +++ b/data/testcorpus/conversations.json @@ -0,0 +1 @@ +{"/ulwa1/ulwa014": {"meta": {}, "vectors": []}} \ No newline at end of file diff --git 
a/data/testcorpus/corpus.json b/data/testcorpus/corpus.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/data/testcorpus/corpus.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/data/testcorpus/index.json b/data/testcorpus/index.json new file mode 100644 index 0000000..5aed485 --- /dev/null +++ b/data/testcorpus/index.json @@ -0,0 +1 @@ +{"utterances-index": {}, "speakers-index": {}, "conversations-index": {}, "overall-index": {}, "version": 1, "vectors": []} \ No newline at end of file diff --git a/data/testcorpus/speakers.json b/data/testcorpus/speakers.json new file mode 100644 index 0000000..468ad1e --- /dev/null +++ b/data/testcorpus/speakers.json @@ -0,0 +1 @@ +{"Tang": {"meta": {}, "vectors": []}, "Yan": {"meta": {}, "vectors": []}} \ No newline at end of file diff --git a/data/testcorpus/utterances.jsonl b/data/testcorpus/utterances.jsonl new file mode 100644 index 0000000..52e8599 --- /dev/null +++ b/data/testcorpus/utterances.jsonl @@ -0,0 +1,10 @@ +{"id": "0", "conversation_id": "/ulwa1/ulwa014", "text": "U oughs inim t\u00ef samting yan", "speaker": "Tang", "meta": {}, "reply-to": null, "timestamp": 1332718704, "vectors": []} +{"id": "1", "conversation_id": "/ulwa1/ulwa014", "text": "mbam ndul ma wandam ana", "speaker": "Yan", "meta": {}, "reply-to": null, "timestamp": 1332732704, "vectors": []} +{"id": "2", "conversation_id": "/ulwa1/ulwa014", "text": "M\u00ef inim wandam bai anapa nd", "speaker": "Tang", "meta": {}, "reply-to": null, "timestamp": 1332743704, "vectors": []} +{"id": "3", "conversation_id": "/ulwa1/ulwa014", "text": "lunda we nd\u00efm\u00efne in", "speaker": "Yan", "meta": {}, "reply-to": null, "timestamp": 1332754704, "vectors": []} +{"id": "4", "conversation_id": "/ulwa1/ulwa014", "text": "k\u00efnakape ak\u00efnaka", "speaker": "Tang", "meta": {}, "reply-to": null, "timestamp": 1332765704, "vectors": []} +{"id": "5", "conversation_id": "/ulwa1/ulwa014", "text": "coughs nd\u00efm\u00efne we ndul wa le 
we nd\u00eft\u00ef ak\u00efnakape malimap mat\u00ef yawa mananda", "speaker": "Yan", "meta": {}, "reply-to": null, "timestamp": 1332776704, "vectors": []} +{"id": "6", "conversation_id": "/ulwa1/ulwa014", "text": "mananda", "speaker": "Tang", "meta": {}, "reply-to": null, "timestamp": 1332787704, "vectors": []} +{"id": "7", "conversation_id": "/ulwa1/ulwa014", "text": "da", "speaker": "Yan", "meta": {}, "reply-to": null, "timestamp": 1332788704, "vectors": []} +{"id": "8", "conversation_id": "/ulwa1/ulwa014", "text": "e k\u00efkal awi ak\u00efnakape", "speaker": "Tang", "meta": {}, "reply-to": null, "timestamp": 1332789704, "vectors": []} +{"id": "9", "conversation_id": "/ulwa1/ulwa014", "text": "at\u00efm inim.", "speaker": "Yan", "meta": {}, "reply-to": null, "timestamp": 1332999704, "vectors": []} diff --git a/notebooks/exploration.ipynb b/notebooks/exploration.ipynb index d4db4bf..9fed698 100644 --- a/notebooks/exploration.ipynb +++ b/notebooks/exploration.ipynb @@ -18,9 +18,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## The `my_module` module\n", + "## The `csv_to_json` module\n", "\n", - "Import and use the `hello()` method:" + "Here, we import a single class from the `csv_to_json` module, and use it." ] }, { @@ -32,16 +32,6 @@ "import sktalk.csv_to_json as cj\n" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## The `Demo` module\n", - "\n", - "Here, we import a single class from the `demo` module, and use it." 
- ] - }, { "cell_type": "code", "execution_count": 3, diff --git a/sktalk/from_convokit.py b/sktalk/from_convokit.py new file mode 100644 index 0000000..aa242fa --- /dev/null +++ b/sktalk/from_convokit.py @@ -0,0 +1,2425 @@ +"""Documentation about the scikit-talk module - csv to json.""" +import json +import pandas as pd +from pandas import DataFrame +# import random +# import shutil +# from typing import Collection, Callable, Set, Generator, Tuple, ValuesView, Union +from typing import Optional, List, Dict, Union, Iterable, Callable, Generator +# from pandas import DataFrame +from tqdm import tqdm +from abc import ABCMeta, abstractmethod +from collections import defaultdict, deque +import os +from yaml import load, Loader + +try: + from collections.abc import MutableMapping +except: + from collections import MutableMapping + +# from convokit.convokitConfig import ConvoKitConfig +# from convokit.util import create_safe_id +# from .convoKitMatrix import ConvoKitMatrix +# from .corpusUtil import * +# from .corpus_helpers import * +# from .storageManager import StorageManager + + +### CLASSES + +class CorpusComponent: + def __init__( + self, + obj_type: str, + owner=None, + id=None, + initial_data=None, + vectors: List[str] = None, + meta=None, + ): + self.obj_type = obj_type # utterance, speaker, conversation + self._owner = owner + self._id = id + self.vectors = vectors if vectors is not None else [] + + # if the CorpusComponent is initialized with an owner set up an entry + # in the owner's storage; if it is not initialized with an owner + # (i.e. 
it is a standalone object) set up a dict-based temp storage + if self.owner is None: + self._temp_storage = initial_data if initial_data is not None else {} + else: + self.owner.storage.initialize_data_for_component( + self.obj_type, + self._id, + initial_value=(initial_data if initial_data is not None else {}), + ) + + if meta is None: + meta = dict() + self._meta = self.init_meta(meta) + + def get_owner(self): + return self._owner + + def set_owner(self, owner): + if owner is self._owner: + # no action needed + return + # stash the metadata first since reassigning self._owner will break its storage connection + meta_vals = {k: v for k, v in self.meta.items()} + previous_owner = self._owner + self._owner = owner + if owner is not None: + # when a new owner Corpus is assigned, we must take the following steps: + # (1) transfer this component's data to the new owner's StorageManager + # (2) avoid duplicates by removing the data from the old owner (or temp storage if there was no prior owner) + # (3) reinitialize the metadata instance + data_dict = ( + dict(previous_owner.storage.get_data(self.obj_type, self.id)) + if previous_owner is not None + else self._temp_storage + ) + self.owner.storage.initialize_data_for_component( + self.obj_type, self.id, initial_value=data_dict + ) + if previous_owner is not None: + previous_owner.storage.delete_data(self.obj_type, self.id) + previous_owner.storage.delete_data("meta", self.meta.storage_key) + else: + del self._temp_storage + self._meta = self.init_meta(meta_vals) + + owner = property(get_owner, set_owner) + + def init_meta(self, meta, overwrite=False): + if self._owner is None: + # ConvoKitMeta instances are not allowed for ownerless (standalone) + # components since they must be backed by a StorageManager. 
def set_id(self, value):
    """
    Store the id of this component, coercing non-string values to a string.

    Strings and None are stored unchanged. Any other value is stored as
    ``str(value)``, after which a warning that names this component's
    ``obj_type`` is issued through ``warn``.

    :param value: the new id
    :return: None
    """
    if value is None or isinstance(value, str):
        # already in an accepted form, keep it untouched
        self._id = value
        return

    # the id is assigned before the warning is emitted
    self._id = str(value)
    message = "{} id must be a string. ID input has been casted to a string."
    warn(message.format(self.obj_type))
def get_vector(
    self, vector_name: str, as_dataframe: bool = False, columns: Optional[List[str]] = None
):
    """
    Fetch this object's entry from the vector matrix named `vector_name`.

    The lookup is delegated to the owner: the matrix is obtained with
    ``owner.get_vector_matrix(vector_name)`` and queried through its
    ``get_vectors`` method for this object's id only.

    :param vector_name: name of vector
    :param as_dataframe: forwarded to ``get_vectors``; asks for a dataframe (True) instead of the raw
        array form (False). False by default.
    :param columns: optional list of named columns to include, forwarded to ``get_vectors``.
        The original documentation states it is only used when as_dataframe is True.
    :return: whatever ``get_vectors`` returns for this single id
    :raises ValueError: if `vector_name` is not listed in this object's ``vectors``
    """
    if vector_name in self.vectors:
        matrix = self.owner.get_vector_matrix(vector_name)
        return matrix.get_vectors(ids=[self.id], as_dataframe=as_dataframe, columns=columns)

    error_text = "This {} has no vector stored as '{}'.".format(self.obj_type, vector_name)
    raise ValueError(error_text)
def to_dict(self):
    """
    Build a plain-dict summary of this component.

    :return: a dict with the keys "id", "vectors" and "meta". The metadata is
        passed through as-is when it is exactly a ``dict``; otherwise its own
        ``to_dict()`` result is used.
    """
    summary = {"id": self.id, "vectors": self.vectors}
    meta = self.meta
    # exact type test on purpose: anything that is not precisely a dict
    # (including dict subclasses) is converted through its to_dict()
    summary["meta"] = meta if type(meta) is dict else meta.to_dict()
    return summary
+ """ + + def __init__(self): + # concrete data storage (i.e., collections) for each component type + # this will be assigned in subclasses + self.data = {"utterance": None, "conversation": None, "speaker": None, "meta": None} + + @abstractmethod + def get_collection_ids(self, component_type: str): + """ + Returns a list of all object IDs within the component_type collection + """ + return NotImplemented + + @abstractmethod + def has_data_for_component(self, component_type: str, component_id: str) -> bool: + """ + Check if there is an existing entry for the component of type component_type + with id component_id + """ + return NotImplemented + + @abstractmethod + def initialize_data_for_component( + self, component_type: str, component_id: str, overwrite: bool = False, initial_value=None + ): + """ + Create a blank entry for a component of type component_type with id + component_id. Will avoid overwriting any existing data unless the + overwrite parameter is set to True. + """ + return NotImplemented + + @abstractmethod + def get_data( + self, + component_type: str, + component_id: str, + property_name: Optional[str] = None, + index=None, + ): + """ + Retrieve the property data for the component of type component_type with + id component_id. If property_name is specified return only the data for + that property, otherwise return the dict containing all properties. + Additionally, the expected type of the property to be fetched may be specified + as a string; this is meant to be used for metadata in conjunction with the index. + """ + return NotImplemented + + @abstractmethod + def update_data( + self, + component_type: str, + component_id: str, + property_name: str, + new_value, + index=None, + ): + """ + Set or update the property data for the component of type component_type + with id component_id. For metadata, the Python object type may also be + specified, to be used in conjunction with the index. 
def purge_obsolete_entries(self, utterance_ids, conversation_ids, speaker_ids, meta_ids):
    """
    Drop every stored entry whose id is absent from the supplied id collections.

    Each collection named in ``self.data`` is scanned via
    ``get_collection_ids``; an entry that is not among the ids given for its
    component type is removed with ``delete_data``.

    :param utterance_ids: ids of the utterances that should be kept
    :param conversation_ids: ids of the conversations that should be kept
    :param speaker_ids: ids of the speakers that should be kept
    :param meta_ids: ids of the metadata entries that should be kept
    :return: None
    """
    component_types = ("utterance", "conversation", "speaker", "meta")
    supplied = (utterance_ids, conversation_ids, speaker_ids, meta_ids)
    ids_to_keep = dict(zip(component_types, map(set, supplied)))

    for component_type in self.data:
        for entry_id in self.get_collection_ids(component_type):
            if entry_id in ids_to_keep[component_type]:
                continue
            # deletion happens while scanning, exactly one entry at a time
            self.delete_data(component_type, entry_id)
+ :type id: str + :param utts: dictionary of utterances by the speaker, where key is utterance id + :param convos: dictionary of conversations started by the speaker, where key is conversation id + :param meta: arbitrary dictionary of attributes associated + with the speaker. + :type meta: dict + :ivar id: id of the speaker. + :ivar meta: A dictionary-like view object providing read-write access to + speaker-level metadata. + """ + + def __init__( + self, + owner=None, + id: str = None, + utts=None, + convos=None, + meta: Optional[Dict] = None, + ): + super().__init__(obj_type="speaker", owner=owner, id=id, meta=meta) + self.utterances = utts if utts is not None else dict() + self.conversations = convos if convos is not None else dict() + # self._split_attribs = set() + # self._update_uid() + + # def identify_by_attribs(self, attribs: Collection) -> None: + # """Identify a speaker by a list of attributes. Sets which speaker info + # attributes should distinguish speakers of the same name in equality tests. + # For example, in the Supreme Court dataset, speakers are labeled with the + # current case id. Call this method with attribs = ["case"] to count + # the same person across different cases as different speakers. + # + # By default, if this function is not called, speakers are identified by name only. + # + # :param attribs: Collection of attribute names. 
def iter_utterances(self, selector=lambda utt: True):  # -> Generator[Utterance, None, None]:
    """
    Lazily walk the Utterances stored for this Speaker, keeping only those the selector accepts.

    :param selector: a (lambda) function that takes an Utterance and returns True or False
        (i.e. include / exclude). The default accepts every Utterance.
    :return: An iterator over the accepted Utterances, in the order of ``self.utterances``
    """
    yield from (utt for utt in self.utterances.values() if selector(utt))
def get_utterance_ids(self, selector=lambda utt: True) -> List[str]:
    """
    Collect the ids of the Utterances made by the speaker.

    :param selector: a (lambda) function that takes an Utterance and returns True or False
        (i.e. include / exclude); it is forwarded to ``iter_utterances``. The default
        accepts every Utterance.
    :return: a List of the ids of Utterances made by the speaker
    """
    # the comprehension already builds a fresh list; no extra list() copy needed
    return [utt.id for utt in self.iter_utterances(selector)]
+ :return: None (prints output) + """ + print("Number of Utterances: {}".format(len(list(self.iter_utterances())))) + print("Number of Conversations: {}".format(len(list(self.iter_conversations())))) + + def __lt__(self, other): + return self.id < other.id + + def __hash__(self): + return super().__hash__() + + def __eq__(self, other): + if not isinstance(other, Speaker): + return False + try: + return self.id == other.id + except AttributeError: + return self.__dict__["_name"] == other.__dict__["_name"] + + def __str__(self): + return "Speaker(id: {}, vectors: {}, meta: {})".format( + repr(self.id), self.vectors, self.meta + ) + + +class Utterance(CorpusComponent): + """Represents a single utterance in the dataset. + :param id: the unique id of the utterance. + :param speaker: the speaker giving the utterance. + :param conversation_id: the id of the root utterance of the conversation. + :param reply_to: id of the utterance this was a reply to. + :param timestamp: timestamp of the utterance. Can be any + comparable type. + :param text: text of the utterance. + :ivar id: the unique id of the utterance. + :ivar speaker: the speaker giving the utterance. + :ivar conversation_id: the id of the root utterance of the conversation. + :ivar reply_to: id of the utterance this was a reply to. + :ivar timestamp: timestamp of the utterance. + :ivar text: text of the utterance. + :ivar meta: A dictionary-like view object providing read-write access to + utterance-level metadata. 
+ """ + + def __init__( + self, + owner=None, + id: Optional[str] = None, + speaker: Optional[Speaker] = None, + conversation_id: Optional[str] = None, + reply_to: Optional[str] = None, + timestamp: Optional[int] = None, + text: str = "", + meta: Optional[Dict] = None, + ): + # check arguments that have alternate naming due to backwards compatibility + if speaker is None: + raise ValueError("No Speaker found: Utterance must be initialized with a Speaker.") + + if conversation_id is not None and not isinstance(conversation_id, str): + warn( + "Utterance conversation_id must be a string: conversation_id of utterance with ID: {} " + "has been casted to a string.".format(id) + ) + conversation_id = str(conversation_id) + if not isinstance(text, str): + warn( + "Utterance text must be a string: text of utterance with ID: {} " + "has been casted to a string.".format(id) + ) + text = "" if text is None else str(text) + + props = { + "speaker_id": speaker.id, + "conversation_id": conversation_id, + "reply_to": reply_to, + "timestamp": timestamp, + "text": text, + } + super().__init__(obj_type="utterance", owner=owner, id=id, initial_data=props, meta=meta) + self.speaker_ = speaker + + ############################################################################ + ## directly-accessible class properties (roughly equivalent to keys in the + ## JSON, plus aliases for compatibility) + ############################################################################ + + def _get_speaker(self): + return self.speaker_ + + def _set_speaker(self, val): + self.speaker_ = val + self.set_data("speaker_id", self.speaker.id) + + speaker = property(_get_speaker, _set_speaker) + + def _get_conversation_id(self): + return self.get_data("conversation_id") + + def _set_conversation_id(self, val): + self.set_data("conversation_id", val) + + conversation_id = property(_get_conversation_id, _set_conversation_id) + + def _get_reply_to(self): + return self.get_data("reply_to") + + def 
def to_dict(self):
    """
    Build a dict snapshot of this Utterance.

    :return: a dict with the keys "id", "conversation_id", "reply_to", "speaker",
        "timestamp", "text", "vectors" and "meta". The "speaker" entry holds the
        value of ``self.speaker`` itself. The metadata is passed through as-is when
        it is exactly a ``dict``; otherwise its own ``to_dict()`` result is used.
    """
    copied_fields = (
        "id",
        "conversation_id",
        "reply_to",
        "speaker",
        "timestamp",
        "text",
        "vectors",
    )
    record = {field: getattr(self, field) for field in copied_fields}

    meta = self.meta
    # exact type test on purpose: only a plain dict is passed through unchanged
    record["meta"] = meta if type(meta) is dict else meta.to_dict()
    return record
text: {}, vectors: {}, meta: {})".format( + repr(self.id), + self.conversation_id, + self.reply_to, + self.speaker, + self.timestamp, + repr(self.text), + self.vectors, + self.meta, + ) + ) + + +class ConvoKitMeta(MutableMapping, dict): + """ + ConvoKitMeta is a dictlike object that stores the metadata attributes of a corpus component + """ + + def __init__(self, owner, convokit_index, obj_type, overwrite=False): + self.owner = owner # Corpus or CorpusComponent + self.index: ConvoKitIndex = convokit_index + self.obj_type = obj_type + + self._get_storage().initialize_data_for_component( + "meta", self.storage_key, overwrite=overwrite + ) + + @property + def storage_key(self) -> str: + return f"{self.obj_type}_{self.owner.id}" + + def __getitem__(self, item): + return self._get_storage().get_data( + "meta", self.storage_key, item, self.index.get_index(self.obj_type) + ) + + def _get_storage(self): + # special case for Corpus meta since that's the only time owner is not a CorpusComponent + # since cannot directly import Corpus to check the type (circular import), as a proxy we + # check for the obj_type attribute which is common to all CorpusComponent but not + # present in Corpus + if not hasattr(self.owner, "obj_type"): + return self.owner.storage + # self.owner -> CorpusComponent + # self.owner.owner -> Corpus that owns the CorpusComponent (only Corpus has direct pointer to storage) + return self.owner.owner.storage + + @staticmethod + def _check_type_and_update_index(index, obj_type, key, value): + if key not in index.indices[obj_type]: + if isinstance(value, type(None)): # new entry with None type means can't infer type yet + index.create_new_index(obj_type, key=key) + else: + type_ = _optimized_type_check(value) + index.update_index(obj_type, key=key, class_type=type_) + else: + # entry exists + if not isinstance(value, type(None)): # do not update index if value is None + if index.get_index(obj_type)[key] != ["bin"]: # if "bin" do no further checks + if 
str(type(value)) not in index.get_index(obj_type)[key]: + new_type = _optimized_type_check(value) + + if new_type == "bin": + index.set_index(obj_type, key, "bin") + else: + index.update_index(obj_type, key, new_type) + + def __setitem__(self, key, value): + if not isinstance(key, str): + warn("Metadata attribute keys must be strings. Input key has been casted to a string.") + key = str(key) + + if self.index.type_check: + ConvoKitMeta._check_type_and_update_index(self.index, self.obj_type, key, value) + self._get_storage().update_data( + "meta", self.storage_key, key, value, self.index.get_index(self.obj_type) + ) + + def __delitem__(self, key): + if self.obj_type == "corpus": + self.index.del_from_index(self.obj_type, key) + self._get_storage().delete_data("meta", self.storage_key, key) + else: + if self.index.lock_metadata_deletion[self.obj_type]: + warn( + "For consistency in metadata attributes in Corpus component objects, deleting metadata attributes " + "from component objects individually is not allowed. 
" + "To delete this metadata attribute from all Corpus components of this type, " + "use corpus.delete_metadata(obj_type='{}', attribute='{}') instead.".format( + self.obj_type, key + ) + ) + else: + self._get_storage().delete_data("meta", self.storage_key, key) + + def __iter__(self): + return ( + self._get_storage() + .get_data("meta", self.storage_key, index=self.index.get_index(self.obj_type)) + .__iter__() + ) + + def __len__(self): + return ( + self._get_storage() + .get_data("meta", self.storage_key, index=self.index.get_index(self.obj_type)) + .__len__() + ) + + def __contains__(self, x): + return ( + self._get_storage() + .get_data("meta", self.storage_key, index=self.index.get_index(self.obj_type)) + .__contains__(x) + ) + + def __repr__(self) -> str: + return "ConvoKitMeta(" + self.to_dict().__repr__() + ")" + + def to_dict(self): + return dict( + self._get_storage().get_data( + "meta", self.storage_key, index=self.index.get_index(self.obj_type) + ) + ) + + def reinitialize_from(self, other: Union["ConvoKitMeta", dict]): + """ + Reinitialize this ConvoKitMeta instance with the data from other + """ + if isinstance(other, ConvoKitMeta): + other = {k: v for k, v in other.to_dict().items()} + elif not isinstance(other, dict): + raise TypeError( + "ConvoKitMeta can only be reinitialized from a dict instance or another ConvoKitMeta" + ) + self._get_storage().initialize_data_for_component( + "meta", self.storage_key, overwrite=True, initial_value=other + ) + + +# _basic_types = {type(0), type(1.0), type("str"), type(True)} # cannot include lists or dicts + + +# def _optimized_type_check(val): +# # if type(obj) +# if type(val) in _basic_types: +# return str(type(val)) +# else: +# try: +# json.dumps(val) +# return str(type(val)) +# except (TypeError, OverflowError): +# return "bin" + +DEFAULT_CONFIG_CONTENTS = ( + "# Default Storage Parameters\n" + "db_host: localhost:27017\n" + "data_directory: ~/.convokit/saved-corpora\n" + "default_storage_mode: mem" +) + 
class ConvoKitConfig:
    """
    Utility class providing read-only access to the ConvoKit config file.

    On construction the YAML config file is parsed into ``config_contents``.
    When the file does not exist it is created (together with its parent
    directory, if any) from ``DEFAULT_CONFIG_CONTENTS``.

    :param filename: path of the config file; when None, the user-expanded
        ``~/.convokit/config.yml`` is used
    :ivar config_contents: the parsed configuration; an empty dict when the
        file contains no YAML document
    """

    def __init__(self, filename: Optional[str] = None):
        if filename is None:
            filename = os.path.expanduser("~/.convokit/config.yml")

        if not os.path.isfile(filename):
            convo_dir = os.path.dirname(filename)
            # a bare filename has an empty dirname and os.makedirs("") raises,
            # so only create a directory when there is one to create;
            # exist_ok turns an already-existing directory into a no-op
            if convo_dir:
                os.makedirs(convo_dir, exist_ok=True)
            with open(filename, "w") as f:
                print(
                    f"No configuration file found at {filename}; writing with contents: \n{DEFAULT_CONFIG_CONTENTS}"
                )
                f.write(DEFAULT_CONFIG_CONTENTS)
                parsed = load(DEFAULT_CONFIG_CONTENTS, Loader=Loader)
        else:
            with open(filename, "r") as f:
                # NOTE(review): yaml's full Loader can construct arbitrary Python
                # objects; consider SafeLoader if this file may come from an
                # untrusted source.
                parsed = load(f.read(), Loader=Loader)

        # an empty YAML document parses to None; fall back to an empty mapping
        # so the property lookups below return their defaults instead of raising
        self.config_contents = parsed if parsed is not None else {}

    def _get_config_from_env_or_file(self, config_key: str, default_val):
        """
        Resolve a setting, letting the environment override the config file.

        :param config_key: name of the setting
        :param default_val: value returned when neither source defines the setting
        :return: the environment value if the variable mapped in ``ENV_VARS`` is
            set, otherwise the config-file value, otherwise ``default_val``
        """
        # settings without an environment mapping are read from the file only
        env_name = ENV_VARS.get(config_key)
        if env_name is not None:
            env_val = os.environ.get(env_name, None)
            if env_val is not None:
                # environment variable setting takes priority
                return env_val
        return self.config_contents.get(config_key, default_val)

    @property
    def db_host(self):
        """Database host setting (environment override allowed); defaults to "localhost:27017"."""
        return self._get_config_from_env_or_file("db_host", "localhost:27017")

    @property
    def data_directory(self):
        """Data directory setting, read from the config file only; defaults to "~/.convokit/saved-corpora"."""
        return self.config_contents.get("data_directory", "~/.convokit/saved-corpora")

    @property
    def default_storage_mode(self):
        """Default storage mode setting (environment override allowed); defaults to "mem"."""
        return self._get_config_from_env_or_file("default_storage_mode", "mem")
def __init__(
    self,
    owner,
    id: Optional[str] = None,
    utterances: Optional[List[str]] = None,
    meta: Optional[Dict] = None,
):
    """
    Set up a Conversation and its per-instance caches.

    :param owner: The Corpus that this Conversation belongs to
    :param id: The unique ID of this Conversation
    :param utterances: A list of the IDs of the Utterances in this Conversation;
        when None, the Conversation starts with an empty list
    :param meta: Table of initial values for conversation-level metadata
    """
    super().__init__(obj_type="conversation", owner=owner, id=id, meta=meta)
    self._owner = owner
    # Default to a fresh list: the other methods append to, slice and iterate
    # over _utterance_ids, all of which fail on None.
    self._utterance_ids: List[str] = utterances if utterances is not None else []
    # lazily computed cache of speaker ids; None means "not computed yet"
    self._speaker_ids = None
    self.tree: Optional[UtteranceNode] = None
def get_speaker_ids(self) -> List[str]:
    """
    List the ids of the speakers behind this Conversation's utterances.

    The ids are gathered once by resolving every utterance id through the owner
    and are then cached in ``self._speaker_ids``; later calls reuse that cache.
    The ids are held in a set, so the returned list has no guaranteed order.

    :return: a list of speaker ids
    """
    if self._speaker_ids is None:
        # bind the empty cache first so it fills up incrementally
        cache = self._speaker_ids = set()
        cache.update(
            self._owner.get_utterance(utt_id).speaker.id for utt_id in self._utterance_ids
        )
    return list(self._speaker_ids)
def iter_speakers(
    self, selector: Callable[[Speaker], bool] = lambda speaker: True
) -> Generator[Speaker, None, None]:
    """
    Lazily walk the Speakers behind this Conversation's utterances, keeping only
    those the selector accepts.

    The speaker ids are gathered once by resolving every utterance id through
    the owner and cached in ``self._speaker_ids``; each Speaker is then looked up
    through the owner.

    :param selector: a (lambda) function that takes a Speaker and returns True or False
        (i.e. include / exclude). The default accepts every Speaker.
    :return: a generator of Speakers
    """
    if self._speaker_ids is None:
        # bind the empty cache first so it fills up incrementally
        self._speaker_ids = set()
        for utterance_id in self._utterance_ids:
            utterance = self._owner.get_utterance(utterance_id)
            self._speaker_ids.add(utterance.speaker.id)

    candidates = (self._owner.get_speaker(speaker_id) for speaker_id in self._speaker_ids)
    yield from (speaker for speaker in candidates if selector(speaker))
+ :return: None (prints output) + """ + print("Number of Utterances: {}".format(len(list(self.iter_utterances())))) + print("Number of Speakers: {}".format(len(list(self.iter_speakers())))) + + def get_chronological_speaker_list( + self, selector: Callable[[Speaker], bool] = lambda speaker: True + ): + """ + Get the speakers in the conversation sorted in chronological order (speakers may appear more than once) + :param selector: (lambda) function for which speakers should be included; all speakers are included by default + :return: list of speakers for each chronological utterance + """ + try: + chrono_utts = sorted(list(self.iter_utterances()), key=lambda utt: utt.timestamp) + return [utt.speaker for utt in chrono_utts if selector(utt.speaker)] + except TypeError as e: + raise ValueError(str(e) + "\nUtterance timestamps may not have been set correctly.") + + def check_integrity(self, verbose: bool = True) -> bool: + """ + Check the integrity of this Conversation; i.e. do the constituent utterances form a complete reply-to chain? 
+ :param verbose: whether to print errors indicating the problems with the Conversation + :return: True if the conversation structure is complete else False + """ + if verbose: + print("Checking reply-to chain of Conversation", self.id) + utt_reply_tos = {utt.id: utt.reply_to for utt in self.iter_utterances()} + target_utt_ids = set(list(utt_reply_tos.values())) + speaker_utt_ids = set(list(utt_reply_tos.keys())) + root_utt_id = target_utt_ids - speaker_utt_ids # There should only be 1 root_utt_id: None + + if len(root_utt_id) != 1: + if verbose: + for utt_id in root_utt_id: + if utt_id is not None: + warn("ERROR: Missing utterance {}".format(utt_id)) + return False + else: + root_id = list(root_utt_id)[0] + if root_id is not None: + if verbose: + warn("ERROR: Missing utterance {}".format(root_id)) + return False + + # sanity check + utts_replying_to_none = 0 + for utt in self.iter_utterances(): + if utt.reply_to is None: + utts_replying_to_none += 1 + + if utts_replying_to_none > 1: + if verbose: + warn("ERROR: Found more than one Utterance replying to None.") + return False + + circular = [ + utt_id for utt_id, utt_reply_to in utt_reply_tos.items() if utt_id == utt_reply_to + ] + if len(circular) > 0: + if verbose: + warn( + "ERROR: Found utterances with .reply_to pointing to themselves: {}".format( + circular + ) + ) + return False + + if verbose: + print("No issues found.\n") + return True + + def initialize_tree_structure(self): + if not self.check_integrity(verbose=False): + raise ValueError( + "Conversation {} reply-to chain does not form a valid tree.".format(self.id) + ) + + root_node_id = None + # Find root node + for utt in self.iter_utterances(): + if utt.reply_to is None: + root_node_id = utt.id + + parent_to_children_ids = defaultdict(list) + for utt in self.iter_utterances(): + parent_to_children_ids[utt.reply_to].append(utt.id) + + wrapped_utts = {utt.id: UtteranceNode(utt) for utt in self.iter_utterances()} + + for parent_id, wrapped_utt in 
wrapped_utts.items(): + wrapped_utt.set_children( + [wrapped_utts[child_id] for child_id in parent_to_children_ids[parent_id]] + ) + + self.tree = wrapped_utts[root_node_id] + + def traverse(self, traversal_type: str, as_utterance: bool = True): + """ + Traverse through the Conversation tree structure in a breadth-first search ('bfs'), depth-first search (dfs), + pre-order ('preorder'), or post-order ('postorder') way. + :param traversal_type: dfs, bfs, preorder, or postorder + :param as_utterance: whether the iterator should yield the utterance (True) or the utterance node (False) + :return: an iterator of the utterances or utterance nodes + """ + if self.tree is None: + self.initialize_tree_structure() + if self.tree is None: + raise ValueError( + "Failed to traverse because Conversation reply-to chain does not form a valid tree." + ) + + traversals = { + "bfs": self.tree.bfs_traversal, + "dfs": self.tree.dfs_traversal, + "preorder": self.tree.pre_order, + "postorder": self.tree.post_order, + } + + for utt_node in traversals[traversal_type](): + yield utt_node.utt if as_utterance else utt_node + + def get_subtree(self, root_utt_id): + """ + Get the utterance node of the specified input id + :param root_utt_id: id of the root node that the subtree starts from + :return: UtteranceNode object + """ + if self.tree is None: + self.initialize_tree_structure() + if self.tree is None: + raise ValueError( + "Failed to traverse because Conversation reply-to chain does not form a valid tree." + ) + + for utt_node in self.tree.bfs_traversal(): + if utt_node.utt.id == root_utt_id: + return utt_node + + def get_longest_paths(self) -> List[List[Utterance]]: + """ + Finds the Utterances form the longest path (i.e. root to leaf) in the Conversation tree. + If there are multiple paths with tied lengths, returns all of them as a list of lists. If only one such path + exists, a list containing a single list of Utterances is returned. 
+ :return: a list of lists of Utterances + """ + if self.tree is None: + self.initialize_tree_structure() + if self.tree is None: + raise ValueError( + "Failed to traverse because Conversation reply-to chain does not form a valid tree." + ) + + paths = self.get_root_to_leaf_paths() + max_len = max(len(path) for path in paths) + + return [p for p in paths if len(p) == max_len] + + def _print_convo_helper( + self, + root: str, + indent: int, + reply_to_dict: Dict[str, str], + utt_info_func: Callable[[Utterance], str], + limit=None, + ) -> None: + """ + Helper function for print_conversation_structure() + """ + if limit is not None: + if self.get_utterance(root).meta["order"] > limit: + return + print(" " * indent + utt_info_func(self.get_utterance(root))) + children_utt_ids = [k for k, v in reply_to_dict.items() if v == root] + for child_utt_id in children_utt_ids: + self._print_convo_helper( + root=child_utt_id, + indent=indent + 4, + reply_to_dict=reply_to_dict, + utt_info_func=utt_info_func, + limit=limit, + ) + + def print_conversation_structure( + self, + utt_info_func: Callable[[Utterance], str] = lambda utt: utt.speaker.id, + limit: int = None, + ) -> None: + """ + Prints an indented representation of utterances in the Conversation with conversation reply-to structure + determining the indented level. The details of each utterance to be printed can be configured. + If limit is set to a value other than None, this will annotate utterances with an 'order' metadata indicating + their temporal order in the conversation, where the first utterance in the conversation is annotated with 1. + :param utt_info_func: callable function taking an utterance as input and returning a string of the desired + utterance information. By default, this is a lambda function returning the utterance's speaker's id + :param limit: maximum number of utterances to print out. if k, this includes the first k utterances. + :return: None. Prints to stdout. 
+ """ + if not self.check_integrity(verbose=False): + raise ValueError( + "Could not print conversation structure: The utterance reply-to chain is broken. " + "Try check_integrity() to diagnose the problem." + ) + + if limit is not None: + assert isinstance(limit, int) + for idx, utt in enumerate(self.get_chronological_utterance_list()): + utt.meta["order"] = idx + 1 + + root_utt_id = [utt for utt in self.iter_utterances() if utt.reply_to is None][0].id + reply_to_dict = {utt.id: utt.reply_to for utt in self.iter_utterances()} + + self._print_convo_helper( + root=root_utt_id, + indent=0, + reply_to_dict=reply_to_dict, + utt_info_func=utt_info_func, + limit=limit, + ) + + def get_utterances_dataframe(self, selector=lambda utt: True, exclude_meta: bool = False): + """ + Get a DataFrame of the Utterances in the COnversation with fields and metadata attributes. + Set an optional selector that filters Utterances that should be included. + Edits to the DataFrame do not change the corpus in any way. + :param exclude_meta: whether to exclude metadata + :param selector: a (lambda) function that takes a Utterance and returns True or False (i.e. include / exclude). + By default, the selector includes all Utterances in the Conversation. 
+ :return: a pandas DataFrame + """ + return get_utterances_dataframe(self, selector, exclude_meta) + + def get_chronological_utterance_list( + self, selector: Callable[[Utterance], bool] = lambda utt: True + ): + """ + Get the utterances in the conversation sorted in increasing order of timestamp + :param selector: function for which utterances should be included; all utterances are included by default + :return: list of utterances, sorted by timestamp + """ + try: + return sorted( + [utt for utt in self.iter_utterances(selector)], key=lambda utt: utt.timestamp + ) + except TypeError as e: + raise ValueError(str(e) + "\nUtterance timestamps may not have been set correctly.") + + def _get_path_from_leaf_to_root( + self, leaf_utt: Utterance, root_utt: Utterance + ) -> List[Utterance]: + """ + Helper function for get_root_to_leaf_paths, which returns the path for a given leaf_utt and root_utt + """ + if leaf_utt == root_utt: + return [leaf_utt] + path = [leaf_utt] + root_id = root_utt.id + while leaf_utt.reply_to != root_id: + path.append(self.get_utterance(leaf_utt.reply_to)) + leaf_utt = path[-1] + path.append(root_utt) + return path[::-1] + + def get_root_to_leaf_paths(self) -> List[List[Utterance]]: + """ + Get the paths (stored as a list of lists of utterances) from the root to each of the leaves + in the conversational tree + :return: List of lists of Utterances + """ + if not self.check_integrity(verbose=False): + raise ValueError( + "Conversation failed integrity check. " + "It is either missing an utterance in the reply-to chain and/or has multiple root nodes. " + "Run check_integrity() to diagnose issues." 
+ ) + + utt_reply_tos = {utt.id: utt.reply_to for utt in self.iter_utterances()} + target_utt_ids = set(list(utt_reply_tos.values())) + speaker_utt_ids = set(list(utt_reply_tos.keys())) + root_utt_id = target_utt_ids - speaker_utt_ids # There should only be 1 root_utt_id: None + assert len(root_utt_id) == 1 + root_utt = [utt for utt in self.iter_utterances() if utt.reply_to is None][0] + leaf_utt_ids = speaker_utt_ids - target_utt_ids + + paths = [ + self._get_path_from_leaf_to_root(self.get_utterance(leaf_utt_id), root_utt) + for leaf_utt_id in leaf_utt_ids + ] + return paths + + @staticmethod + def generate_default_conversation_id(utterance_id): + return f"__default_conversation__{utterance_id}" + + def __hash__(self): + return super().__hash__() + + def __eq__(self, other): + if not isinstance(other, Conversation): + return False + return self.id == other.id and set(self._utterance_ids) == set(other._utterance_ids) + + def __str__(self): + return "Conversation('id': {}, 'utterances': {}, 'meta': {})".format( + repr(self.id), self._utterance_ids, self.meta + ) + +class Corpus: + """ + Represents a dataset, which can be loaded from a folder or constructed from a list of utterances. + :param filename: Path to a folder containing a Corpus or to an utterances.jsonl / utterances.json file to load + :param utterances: list of utterances to initialize Corpus from + :param preload_vectors: list of names of vectors to be preloaded from directory; by default, + no vectors are loaded but can be loaded any time after corpus initialization (i.e. vectors are lazy-loaded). + :param utterance_start_index: if loading from directory and the corpus folder contains utterances.jsonl, specify the + line number (zero-indexed) to begin parsing utterances from + :param utterance_end_index: if loading from directory and the corpus folder contains utterances.jsonl, specify the + line number (zero-indexed) of the last utterance to be parsed. 
+ :param merge_lines: whether to merge adjacent lines from same speaker if multiple consecutive utterances belong to + the same conversation. + :param exclude_utterance_meta: utterance metadata to be ignored + :param exclude_conversation_meta: conversation metadata to be ignored + :param exclude_speaker_meta: speaker metadata to be ignored + :param exclude_overall_meta: overall metadata to be ignored + :param disable_type_check: whether to do type checking when loading the Corpus from a directory. + Type-checking ensures that the ConvoKitIndex is initialized correctly. However, it may be unnecessary if the + index.json is already accurate and disabling it will allow for a faster corpus load. This parameter is set to + True by default, i.e. type-checking is not carried out. + :ivar meta_index: index of Corpus metadata + :ivar vectors: the vectors stored in the Corpus + :ivar corpus_dirpath: path to the directory the corpus was loaded from + """ + + def __init__( + self, + filename: Optional[str] = None, + utterances: Optional[List[Utterance]] = None, + db_collection_prefix: Optional[str] = None, + db_host: Optional[str] = None, + preload_vectors: List[str] = None, + utterance_start_index: int = None, + utterance_end_index: int = None, + merge_lines: bool = False, + exclude_utterance_meta: Optional[List[str]] = None, + exclude_conversation_meta: Optional[List[str]] = None, + exclude_speaker_meta: Optional[List[str]] = None, + exclude_overall_meta: Optional[List[str]] = None, + disable_type_check=True, + storage_type: Optional[str] = None, + storage: Optional[StorageManager] = None, + ): + self.config = ConvoKitConfig() + self.corpus_dirpath = get_corpus_dirpath(filename) + + # configure corpus ID (optional for mem mode, required for DB mode) + if storage_type is None: + storage_type = self.config.default_storage_mode + if db_collection_prefix is None and filename is None and storage_type == "db": + db_collection_prefix = create_safe_id() + warn( + "You are in DB 
mode, but no collection prefix was specified and no filename was given from which to infer one." + "Will use a randomly generated unique prefix " + db_collection_prefix + ) + self.id = get_corpus_id(db_collection_prefix, filename, storage_type) + self.storage_type = storage_type + self.storage = initialize_storage(self, storage, storage_type, db_host) + + self.meta_index = ConvoKitIndex(self) + self.meta = ConvoKitMeta(self, self.meta_index, "corpus") + + # private storage + self._vector_matrices = dict() + + convos_data = defaultdict(dict) + if exclude_utterance_meta is None: + exclude_utterance_meta = [] + if exclude_conversation_meta is None: + exclude_conversation_meta = [] + if exclude_speaker_meta is None: + exclude_speaker_meta = [] + if exclude_overall_meta is None: + exclude_overall_meta = [] + + if filename is not None and storage_type == "db": + # JSON-to-DB construction mode uses a specialized code branch, which + # optimizes for this use case by using direct batch insertions into the + # DB rather than going through the StorageManager, hence improving + # efficiency. + + with open(os.path.join(filename, "index.json"), "r") as f: + idx_dict = json.load(f) + self.meta_index.update_from_dict(idx_dict) + + # populate the DB with the contents of the source file + ids_in_db = populate_db_from_file( + filename, + self.storage.db, + self.id, + self.meta_index, + utterance_start_index, + utterance_end_index, + exclude_utterance_meta, + exclude_conversation_meta, + exclude_speaker_meta, + exclude_overall_meta, + ) + + # with the StorageManager's DB now populated, initialize the corresponding + # CorpusComponent instances. 
+ init_corpus_from_storage_manager(self, ids_in_db) + + self.meta_index.enable_type_check() + # load preload_vectors + if preload_vectors is not None: + for vector_name in preload_vectors: + matrix = ConvoKitMatrix.from_dir(self.corpus_dirpath, vector_name) + if matrix is not None: + self._vector_matrices[vector_name] = matrix + + if merge_lines: + self.utterances = merge_utterance_lines(self.utterances) + else: + # Construct corpus from file or directory + if filename is not None: + if disable_type_check: + self.meta_index.disable_type_check() + if os.path.isdir(filename): + utterances = load_utterance_info_from_dir( + filename, utterance_start_index, utterance_end_index, exclude_utterance_meta + ) + + speakers_data = load_speakers_data_from_dir(filename, exclude_speaker_meta) + convos_data = load_convos_data_from_dir(filename, exclude_conversation_meta) + load_corpus_meta_from_dir(filename, self.meta, exclude_overall_meta) + + with open(os.path.join(filename, "index.json"), "r") as f: + idx_dict = json.load(f) + self.meta_index.update_from_dict(idx_dict) + + # unpack all binary data + unpack_all_binary_data( + filename=filename, + meta_index=self.meta_index, + meta=self.meta, + utterances=utterances, + speakers_data=speakers_data, + convos_data=convos_data, + exclude_utterance_meta=exclude_utterance_meta, + exclude_speaker_meta=exclude_speaker_meta, + exclude_conversation_meta=exclude_conversation_meta, + exclude_overall_meta=exclude_overall_meta, + ) + + else: + speakers_data = defaultdict(dict) + convos_data = defaultdict(dict) + utterances = load_from_utterance_file( + filename, utterance_start_index, utterance_end_index + ) + + self.utterances = dict() + self.speakers = dict() + + initialize_speakers_and_utterances_objects(self, utterances, speakers_data) + + self.meta_index.enable_type_check() + + # load preload_vectors + if preload_vectors is not None: + for vector_name in preload_vectors: + matrix = ConvoKitMatrix.from_dir(self.corpus_dirpath, vector_name) 
+ if matrix is not None: + self._vector_matrices[vector_name] = matrix + + elif utterances is not None: # Construct corpus from utterances list + self.speakers = {utt.speaker.id: utt.speaker for utt in utterances} + self.utterances = {utt.id: utt for utt in utterances} + for speaker in self.speakers.values(): + speaker.owner = self + for utt in self.utterances.values(): + utt.owner = self + + if merge_lines: + self.utterances = merge_utterance_lines(self.utterances) + + if disable_type_check: + self.meta_index.disable_type_check() + # if corpus is nonempty (check for self.utterances), construct the conversation + # data from the utterance list + if hasattr(self, "utterances"): + self.conversations = initialize_conversations( + self, convos_data, fill_missing_convo_ids=True + ) + self.meta_index.enable_type_check() + self.update_speakers_data() + + def update_speakers_data(self) -> None: + """ + Updates the conversation and utterance lists of every Speaker in the Corpus + :return: None + """ + speakers_utts = defaultdict(list) + speakers_convos = defaultdict(list) + + for utt in self.iter_utterances(): + speakers_utts[utt.speaker.id].append(utt) + + for convo in self.iter_conversations(): + for utt in convo.iter_utterances(): + speakers_convos[utt.speaker.id].append(convo) + + for speaker in self.iter_speakers(): + speaker.utterances = {utt.id: utt for utt in speakers_utts[speaker.id]} + speaker.conversations = {convo.id: convo for convo in speakers_convos[speaker.id]} + + def iter_utterances( + self, selector: Optional[Callable[[Utterance], bool]] = lambda utt: True + ) -> Generator[Utterance, None, None]: + """ + Get utterances in the Corpus, with an optional selector that filters for Utterances that should be included. + :param selector: a (lambda) function that takes an Utterance and returns True or False (i.e. include / exclude). + By default, the selector includes all Utterances in the Corpus. 
+ :return: a generator of Utterances + """ + for v in self.utterances.values(): + if selector(v): + yield v + + def iter_conversations( + self, selector: Optional[Callable[[Conversation], bool]] = lambda convo: True + ) -> Generator[Conversation, None, None]: + """ + Get conversations in the Corpus, with an optional selector that filters for Conversations that should be included + :param selector: a (lambda) function that takes a Conversation and returns True or False (i.e. include / exclude). + By default, the selector includes all Conversations in the Corpus. + :return: a generator of Conversations + """ + for v in self.conversations.values(): + if selector(v): + yield v + + def get_utterance(self, utt_id: str) -> Utterance: + """ + Gets Utterance of the specified id from the corpus + :param utt_id: id of Utterance + :return: Utterance + """ + return self.utterances[utt_id] + + def get_conversation(self, convo_id: str) -> Conversation: + """ + Gets Conversation of the specified id from the corpus + :param convo_id: id of Conversation + :return: Conversation + """ + return self.conversations[convo_id] + + def get_speaker(self, speaker_id: str) -> Speaker: + """ + Gets Speaker of the specified id from the corpus + :param speaker_id: id of Speaker + :return: Speaker + """ + return self.speakers[speaker_id] + + def get_object(self, obj_type: str, oid: str): + """ + General Corpus object getter. 
Gets Speaker / Utterance / Conversation of specified id from the Corpus + :param obj_type: "speaker", "utterance", or "conversation" + :param oid: object id + :return: Corpus object of specified object type with specified object id + """ + assert obj_type in ["speaker", "utterance", "conversation"] + if obj_type == "speaker": + return self.get_speaker(oid) + elif obj_type == "utterance": + return self.get_utterance(oid) + else: + return self.get_conversation(oid) + + def iter_speakers( + self, selector: Optional[Callable[[Speaker], bool]] = lambda speaker: True + ) -> Generator[Speaker, None, None]: + """ + Get Speakers in the Corpus, with an optional selector that filters for Speakers that should be included + :param selector: a (lambda) function that takes a Speaker and returns True or False (i.e. include / exclude). + By default, the selector includes all Speakers in the Corpus. + :return: a generator of Speakers + """ + + for speaker in self.speakers.values(): + if selector(speaker): + yield speaker + + @property + def vectors(self): + return self.meta_index.vectors + + @vectors.setter + def vectors(self, new_vectors): + if not isinstance(new_vectors, type(["stringlist"])): + raise ValueError( + "The preload_vectors being set should be a list of strings, " + "where each string is the name of a vector matrix." + ) + self.meta_index.vectors = new_vectors + + def dump( + self, + name: str, + base_path: Optional[str] = None, + exclude_vectors: List[str] = None, + force_version: int = None, + overwrite_existing_corpus: bool = False, + fields_to_skip=None, + ) -> None: + """ + Dumps the corpus and its metadata to disk. Optionally, set `force_version` to a desired integer version number, + otherwise the version number is automatically incremented. + :param name: name of corpus + :param base_path: base directory to save corpus in (None to save to a default directory) + :param exclude_vectors: list of names of vector matrices to exclude from the dumping step. 
By default; all + vector matrices that belong to the Corpus (whether loaded or not) are dumped. + :param force_version: version number to set for the dumped corpus + :param overwrite_existing_corpus: if True, save to the path you loaded the corpus from, overriding the original corpus. + :param fields_to_skip: a dictionary of {object type: list of metadata attributes to omit when writing to disk}. object types can be one of "speaker", "utterance", "conversation", "corpus". + """ + if fields_to_skip is None: + fields_to_skip = dict() + dir_name = name + if base_path is not None and overwrite_existing_corpus: + raise ValueError("Not allowed to specify both base_path and overwrite_existing_corpus!") + if overwrite_existing_corpus and self.corpus_dirpath is None: + raise ValueError( + "Cannot use save to existing path on Corpus generated from utterance list!" + ) + if not overwrite_existing_corpus: + if base_path is None: + base_path = os.path.expanduser("~/.convokit/") + if not os.path.exists(base_path): + os.mkdir(base_path) + base_path = os.path.join(base_path, "saved-corpora/") + if not os.path.exists(base_path): + os.mkdir(base_path) + dir_name = os.path.join(base_path, dir_name) + else: + dir_name = os.path.join(self.corpus_dirpath) + + if not os.path.exists(dir_name): + os.mkdir(dir_name) + + # dump speakers, conversations, utterances + dump_corpus_component( + self, dir_name, "speakers.json", "speaker", "speaker", exclude_vectors, fields_to_skip + ) + dump_corpus_component( + self, + dir_name, + "conversations.json", + "conversation", + "convo", + exclude_vectors, + fields_to_skip, + ) + dump_utterances(self, dir_name, exclude_vectors, fields_to_skip) + + # dump corpus + with open(os.path.join(dir_name, "corpus.json"), "w") as f: + d_bin = defaultdict(list) + meta_up = dump_helper_bin(self.meta, d_bin, fields_to_skip.get("corpus", None)) + + json.dump(meta_up, f) + for name, l_bin in d_bin.items(): + with open(os.path.join(dir_name, name + "-overall-bin.p"), 
"wb") as f_pk: + pickle.dump(l_bin, f_pk) + + # dump index + with open(os.path.join(dir_name, "index.json"), "w") as f: + json.dump( + self.meta_index.to_dict( + exclude_vectors=exclude_vectors, force_version=force_version + ), + f, + ) + + # dump vectors + if exclude_vectors is not None: + vectors_to_dump = [v for v in self.vectors if v not in set(exclude_vectors)] + else: + vectors_to_dump = self.vectors + for vector_name in vectors_to_dump: + if vector_name in self._vector_matrices: + self._vector_matrices[vector_name].dump(dir_name) + else: + src = os.path.join(self.corpus_dirpath, "vectors.{}.p".format(vector_name)) + dest = os.path.join(dir_name, "vectors.{}.p".format(vector_name)) + shutil.copy(src, dest) + + # with open(os.path.join(dir_name, "processed_text.index.json"), "w") as f: + # json.dump(list(self.processed_text.keys()), f) + + def get_object_ids( + self, + obj_type: str, + selector: Callable[[Union[Speaker, Utterance, Conversation]], bool] = lambda obj: True, + ): + """ + Get a list of ids of Corpus objects of the specified type in the Corpus, with an optional selector that filters for objects that should be included + :param obj_type: "speaker", "utterance", or "conversation" + :param selector: a (lambda) function that takes a Corpus object and returns True or False (i.e. include / exclude). + By default, the selector includes all objects of the specified type in the Corpus. 
+ :return: list of Corpus object ids + """ + assert obj_type in ["speaker", "utterance", "conversation"] + return [obj.id for obj in self.iter_objs(obj_type, selector)] + + def iter_objs( + self, + obj_type: str, + selector: Callable[[Union[Speaker, Utterance, Conversation]], bool] = lambda obj: True, + ): + """ + Get Corpus objects of specified type from the Corpus, with an optional selector that filters for Corpus object that should be included + :param obj_type: "speaker", "utterance", or "conversation" + :param selector: a (lambda) function that takes a Corpus object and returns True or False (i.e. include / exclude). + By default, the selector includes all objects of the specified type in the Corpus. + :return: a generator of Speakers + """ + + assert obj_type in ["speaker", "utterance", "conversation"] + obj_iters = { + "conversation": self.iter_conversations, + "speaker": self.iter_speakers, + "utterance": self.iter_utterances, + } + + return obj_iters[obj_type](selector) + + @staticmethod + def from_pandas( + utterances_df: DataFrame, + speakers_df: Optional[DataFrame] = None, + conversations_df: Optional[DataFrame] = None, + ) -> "Corpus": + """ + Generates a Corpus from utterances, speakers, and conversations dataframes. + For each dataframe, if the 'id' column is absent, the dataframe index will be used as the id. + Metadata should be denoted with a 'meta.' column in the dataframe. For example, if an utterance is to have + a metadata key 'score', then the 'meta.score' column must be present in dataframe. + `speakers_df` and `conversations_df` are optional, as their IDs can be inferred from `utterances_df`, and so + their main purpose is to hold speaker / conversation metadata. They should only be included if there exists + metadata for the speakers / conversations respectively. + Metadata values that are not basic Python data structures (i.e. 
lists, dicts, tuples) may be included in the + dataframes but may lead to unexpected behavior, depending on how `pandas` serializes / deserializes those values. + Note that as metadata can be added to the Corpus after it is constructed, there is no need to include all + metadata keys in the dataframe if it would be inconvenient. + :param utterances_df: utterances data in a pandas Dataframe, all primary data fields expected, with metadata optional + :param speakers_df: (optional) speakers data in a pandas Dataframe + :param conversations_df: (optional) conversations data in a pandas Dataframe + :return: Corpus constructed from the dataframe(s) + """ + columns = ["speaker", "id", "timestamp", "conversation_id", "reply_to", "text"] + + for df_type, df in [ + ("utterances", utterances_df), + ("conversations", conversations_df), + ("speakers", speakers_df), + ]: + if df is None: + continue + if "id" not in df.columns: + print( + f"ID column is not present in {df_type} dataframe, generated ID column from dataframe index..." + ) + df["id"] = df.index + + # checking if dataframes contain their respective required columns + assert ( + pd.Series(columns).isin(utterances_df.columns).all() + ), "Utterances dataframe must contain all primary data fields" + + utterance_meta_cols = extract_meta_from_df(utterances_df) + + utterance_list = [] + for index, row in tqdm(utterances_df.iterrows()): + if utterance_meta_cols: + metadata = {} + for meta_col in utterance_meta_cols: + metadata[meta_col] = row["meta." 
+ meta_col] + else: + metadata = None + + # adding utterance in utterance list + reply_to = None if row["reply_to"] == "None" else row["reply_to"] + utterance_list.append( + Utterance( + id=str(row["id"]), + speaker=Speaker(id=str(row["speaker"])), + conversation_id=str(row["conversation_id"]), + reply_to=reply_to, + timestamp=row["timestamp"], + text=row["text"], + meta=metadata, + ) + ) + + # initializing corpus using utterance_list + corpus = Corpus(utterances=utterance_list) + if speakers_df is not None: + corpus.update_metadata_from_df("speaker", speakers_df) + if conversations_df is not None: + corpus.update_metadata_from_df("conversation", conversations_df) + + return corpus + +class MemStorageManager(StorageManager): + """ + Concrete StorageManager implementation for in-memory data storage. + Collections are implemented as vanilla Python dicts. + """ + + def __init__(self): + super().__init__() + + # initialize component collections as dicts + for key in self.data: + self.data[key] = {} + + def get_collection_ids(self, component_type: str): + return list(self.get_collection(component_type).keys()) + + def has_data_for_component(self, component_type: str, component_id: str) -> bool: + collection = self.get_collection(component_type) + return component_id in collection + + def initialize_data_for_component( + self, component_type: str, component_id: str, overwrite: bool = False, initial_value=None + ): + collection = self.get_collection(component_type) + if overwrite or not self.has_data_for_component(component_type, component_id): + collection[component_id] = initial_value if initial_value is not None else {} + + def get_data( + self, + component_type: str, + component_id: str, + property_name: Optional[str] = None, + index=None, + ): + collection = self.get_collection(component_type) + if component_id not in collection: + raise KeyError( + f"This StorageManager does not have an entry for the {component_type} with id {component_id}." 
+ ) + if property_name is None: + return collection[component_id] + else: + return collection[component_id][property_name] + + def update_data( + self, + component_type: str, + component_id: str, + property_name: str, + new_value, + index=None, + ): + collection = self.get_collection(component_type) + # don't create new collections if the ID is not found; this is supposed to be handled in the + # CorpusComponent constructor so if the ID is missing that indicates something is wrong + if component_id not in collection: + raise KeyError( + f"This StorageManager does not have an entry for the {component_type} with id {component_id}." + ) + collection[component_id][property_name] = new_value + + def delete_data( + self, component_type: str, component_id: str, property_name: Optional[str] = None + ): + collection = self.get_collection(component_type) + if component_id not in collection: + raise KeyError( + f"This StorageManager does not have an entry for the {component_type} with id {component_id}." + ) + if property_name is None: + del collection[component_id] + else: + del collection[component_id][property_name] + + def clear_all_data(self): + for key in self.data: + self.data[key] = {} + + def count_entries(self, component_type: str): + return len(self.get_collection(component_type)) + + +class DBStorageManager(StorageManager): + """ + Concrete StorageManager implementation for database-backed data storage. + Collections are implemented as MongoDB collections. 
+ """ + + def __init__(self, collection_prefix, db_host: Optional[str] = None): + super().__init__() + + self.collection_prefix = collection_prefix + self.client = MongoClient(db_host) + self.db = self.client["convokit"] + + # this special lock is used for reconnecting to an existing DB, whereupon + # it is known that all the data already exists and so the initialization + # step can be skipped, greatly saving time + self.bypass_init = False + + # initialize component collections as MongoDB collections in the convokit db + for key in self.data: + self.data[key] = self.db[self._get_collection_name(key)] + + def _get_collection_name(self, component_type: str) -> str: + return f"{self.collection_prefix}_{component_type}" + + def get_collection_ids(self, component_type: str): + return [ + doc["_id"] + for doc in self.db[self._get_collection_name(component_type)].find(projection=["_id"]) + ] + + def has_data_for_component(self, component_type: str, component_id: str) -> bool: + collection = self.get_collection(component_type) + lookup = collection.find_one({"_id": component_id}) + return lookup is not None + + def initialize_data_for_component( + self, component_type: str, component_id: str, overwrite: bool = False, initial_value=None + ): + if self.bypass_init: + return + collection = self.get_collection(component_type) + if overwrite or not self.has_data_for_component(component_type, component_id): + data = initial_value if initial_value is not None else {} + collection.replace_one({"_id": component_id}, data, upsert=True) + + def get_data( + self, + component_type: str, + component_id: str, + property_name: Optional[str] = None, + index=None, + ): + collection = self.get_collection(component_type) + all_fields = collection.find_one({"_id": component_id}) + if all_fields is None: + raise KeyError( + f"This StorageManager does not have an entry for the {component_type} with id {component_id}." 
+ ) + if property_name is None: + # if some data is known to be binary type, unpack it + if index is not None: + for key in all_fields: + if index.get(key, None) == ["bin"]: + all_fields[key] = pickle.loads(all_fields[key]) + # do not include the MongoDB-specific _id field + del all_fields["_id"] + return all_fields + else: + result = all_fields[property_name] + if index is not None and index.get(property_name, None) == ["bin"]: + # binary data must be unpacked + result = pickle.loads(result) + return result + + def update_data( + self, + component_type: str, + component_id: str, + property_name: str, + new_value, + index=None, + ): + data = self.get_data(component_type, component_id) + if index is not None and index.get(property_name, None) == ["bin"]: + # non-serializable types must go through pickling then be encoded as bson.Binary + new_value = bson.Binary(pickle.dumps(new_value)) + data[property_name] = new_value + collection = self.get_collection(component_type) + collection.update_one({"_id": component_id}, {"$set": data}) + + def delete_data( + self, component_type: str, component_id: str, property_name: Optional[str] = None + ): + collection = self.get_collection(component_type) + if property_name is None: + # delete the entire document + collection.delete_one({"_id": component_id}) + else: + # delete only the specified property + collection.update_one({"_id": component_id}, {"$unset": {property_name: ""}}) + + def clear_all_data(self): + for key in self.data: + self.data[key].drop() + self.data[key] = self.db[self._get_collection_name(key)] + + def count_entries(self, component_type: str): + return self.get_collection(component_type).estimated_document_count() + +class ConvoKitIndex: + def __init__( + self, + owner, + utterances_index: Optional[Dict[str, List[str]]] = None, + speakers_index: Optional[Dict[str, List[str]]] = None, + conversations_index: Optional[Dict[str, List[str]]] = None, + overall_index: Optional[Dict[str, List[str]]] = None, + 
vectors: Optional[List[str]] = None, + version: Optional[int] = 0, + ): + self.owner = owner + self.utterances_index = utterances_index if utterances_index is not None else {} + self.speakers_index = speakers_index if speakers_index is not None else {} + self.conversations_index = conversations_index if conversations_index is not None else {} + self.overall_index = overall_index if overall_index is not None else {} + self.indices = { + "utterance": self.utterances_index, + "conversation": self.conversations_index, + "speaker": self.speakers_index, + "corpus": self.overall_index, + } + self.vectors = set(vectors) if vectors is not None else set() + self.version = version + self.type_check = True # toggle-able to enable/disable type checks on metadata additions + self.lock_metadata_deletion = {"utterance": True, "conversation": True, "speaker": True} + + def create_new_index(self, obj_type: str, key: str): + """ + Create a new entry in the obj_type index with a blank type list, + representing an "Any" type which might be later refined. + :param obj_type: utterance, conversation, or speaker + :param key: string + :param class_type: class type + """ + if key not in self.indices[obj_type]: + self.indices[obj_type][key] = [] + + def update_index(self, obj_type: str, key: str, class_type: str): + """ + Append the class_type to the index + :param obj_type: utterance, conversation, or speaker + :param key: string + :param class_type: class type + :return: None + """ + assert type(key) == str + assert "class" in class_type or class_type == "bin" + if key not in self.indices[obj_type]: + self.indices[obj_type][key] = [] + self.indices[obj_type][key].append(class_type) + + def set_index(self, obj_type: str, key: str, class_type: str): + """ + Set the class_type of the index as [`class_type`]. 
+ :param obj_type: utterance, conversation, or speaker + :param key: string + :param class_type: class type + :return: None + """ + assert type(key) == str + assert "class" in class_type or class_type == "bin" + self.indices[obj_type][key] = [class_type] + + def get_index(self, obj_type: str): + return self.indices[obj_type] + + def del_from_index(self, obj_type: str, key: str): + assert type(key) == str + if key not in self.indices[obj_type]: + return + del self.indices[obj_type][key] + # + # corpus = self.owner + # for corpus_obj in corpus.iter_objs(obj_type): + # if key in corpus_obj.meta: + # del corpus_obj.meta[key] + + def add_vector(self, vector_name): + self.vectors.add(vector_name) + + def del_vector(self, vector_name): + self.vectors.remove(vector_name) + + def update_from_dict(self, meta_index: Dict): + self.conversations_index.update(meta_index["conversations-index"]) + self.utterances_index.update(meta_index["utterances-index"]) + speaker_index = "speakers-index" if "speakers-index" in meta_index else "users-index" + self.speakers_index.update(meta_index[speaker_index]) + self.overall_index.update(meta_index["overall-index"]) + self.vectors = set(meta_index.get("vectors", set())) + for index in self.indices.values(): + for k, v in index.items(): + if isinstance(v, str): + index[k] = [v] + + self.version = meta_index["version"] + + def to_dict(self, exclude_vectors: List[str] = None, force_version=None): + retval = dict() + retval["utterances-index"] = self.utterances_index + retval["speakers-index"] = self.speakers_index + retval["conversations-index"] = self.conversations_index + retval["overall-index"] = self.overall_index + + if force_version is None: + retval["version"] = self.version + 1 + else: + retval["version"] = force_version + + if exclude_vectors is not None: + retval["vectors"] = list(self.vectors - set(exclude_vectors)) + else: + retval["vectors"] = list(self.vectors) + + return retval + + def enable_type_check(self): + self.type_check = 
True + + def disable_type_check(self): + self.type_check = False + + def __str__(self): + return str(self.to_dict(force_version=self.version)) + + def __repr__(self): + return str(self) + + +### HELPER FUNCTIONS + +def extract_meta_from_df(df): + meta_cols = [col.split(".")[1] for col in df if col.startswith("meta")] + return meta_cols + +def get_corpus_dirpath(filename: str) -> Optional[str]: + if filename is None: + return None + elif os.path.isdir(filename): + return filename + else: + return os.path.dirname(filename) + +def get_corpus_id( + db_collection_prefix: Optional[str], filename: Optional[str], storage_type: str +) -> Optional[str]: + if db_collection_prefix is not None: + # treat the unique collection prefix as the ID (even if a filename is specified) + corpus_id = db_collection_prefix + elif filename is not None: + # automatically derive an ID from the file path + corpus_id = os.path.basename(os.path.normpath(filename)) + else: + corpus_id = None + + if storage_type == "db" and corpus_id is not None: + compatibility_msg = check_id_for_mongodb(corpus_id) + if compatibility_msg is not None: + random_id = create_safe_id() + warn( + f'Attempting to use "{corpus_id}" as DB collection prefix failed because: {compatibility_msg}. Will instead use randomly generated prefix {random_id}.' + ) + corpus_id = random_id + + return corpus_id + +def initialize_storage( + corpus: "Corpus", storage: Optional[StorageManager], storage_type: str, db_host: Optional[str] +): + if storage is not None: + return storage + else: + if storage_type == "mem": + return MemStorageManager() + elif storage_type == "db": + if db_host is None: + db_host = corpus.config.db_host + return DBStorageManager(corpus.id, db_host) + else: + raise ValueError( + f"Unrecognized setting '{storage_type}' for storage type; should be either 'mem' or 'db'." 
+ ) + +def initialize_conversations( + corpus, convos_data, convo_to_utts=None, fill_missing_convo_ids: bool = False +): + """ + Initialize Conversation objects from utterances and conversations data. + If a mapping from Conversation IDs to their constituent Utterance IDs is + already known (e.g., as a side effect of a prior computation) they can be + directly provided via the convo_to_utts parameter, otherwise the mapping + will be computed by iteration over the Utterances in utt_dict. + """ + if fill_missing_convo_ids: + fill_missing_conversation_ids(corpus.utterances) + + # organize utterances by conversation + if convo_to_utts is None: + convo_to_utts = defaultdict(list) # temp container identifying utterances by conversation + for utt in corpus.utterances.values(): + convo_key = ( + utt.conversation_id + ) # each conversation_id is considered a separate conversation + convo_to_utts[convo_key].append(utt.id) + conversations = {} + for convo_id in convo_to_utts: + # look up the metadata associated with this conversation, if any + convo_data = convos_data.get(convo_id, None) + if convo_data is not None: + if KeyMeta in convo_data: + convo_meta = convo_data[KeyMeta] + else: + convo_meta = convo_data + else: + convo_meta = None + + convo = Conversation( + owner=corpus, id=convo_id, utterances=convo_to_utts[convo_id], meta=convo_meta + ) + + if convo_data is not None and KeyVectors in convo_data and KeyMeta in convo_data: + convo.vectors = convo_data.get(KeyVectors, []) + conversations[convo_id] = convo + return conversations + + +def fill_missing_conversation_ids(utterances_dict: Dict[str, Utterance]) -> None: + """ + Populates `conversation_id` in Utterances that have `conversation_id` set to `None`, with a Conversation root-specific generated ID + :param utterances_dict: + :return: + """ + utts_without_convo_ids = [ + utt for utt in utterances_dict.values() if utt.conversation_id is None + ] + utt_ids_to_replier_ids = defaultdict(deque) + 
convo_roots_without_convo_ids = [] + convo_roots_with_convo_ids = [] + for utt in utterances_dict.values(): + if utt.reply_to is None: + if utt.conversation_id is None: + convo_roots_without_convo_ids.append(utt.id) + else: + convo_roots_with_convo_ids.append(utt.id) + else: + utt_ids_to_replier_ids[utt.reply_to].append(utt.id) + + # connect the reply-to edges for convo roots without convo ids + for root_utt_id in convo_roots_without_convo_ids: + generated_conversation_id = Conversation.generate_default_conversation_id( + utterance_id=root_utt_id + ) + utterances_dict[root_utt_id].conversation_id = generated_conversation_id + _update_reply_to_chain_with_conversation_id( + utterances_dict=utterances_dict, + utt_ids_to_replier_ids=utt_ids_to_replier_ids, + root_utt_id=root_utt_id, + conversation_id=generated_conversation_id, + ) + + # Previous section handles all *new* conversations + # Next section handles utts that belong to existing conversations + for root_utt_id in convo_roots_with_convo_ids: + conversation_id = utterances_dict[root_utt_id].conversation_id + _update_reply_to_chain_with_conversation_id( + utterances_dict=utterances_dict, + utt_ids_to_replier_ids=utt_ids_to_replier_ids, + root_utt_id=root_utt_id, + conversation_id=conversation_id, + ) + + # It's still possible to have utts that reply to non-existent utts + # These are the utts that do not have a conversation_id even at this step + for utt in utts_without_convo_ids: + if utt.conversation_id is None: + raise ValueError( + f"Invalid Utterance found: Utterance {utt.id} replies to an Utterance '{utt.reply_to}' that does not exist." 
+ ) + +def _update_reply_to_chain_with_conversation_id( + utterances_dict: Dict[str, Utterance], + utt_ids_to_replier_ids: Dict[str, Iterable[str]], + root_utt_id: str, + conversation_id: str, +): + repliers = utt_ids_to_replier_ids.get(root_utt_id, deque()) + while len(repliers) > 0: + replier_id = repliers.popleft() + utterances_dict[replier_id].conversation_id = conversation_id + repliers.extend(utt_ids_to_replier_ids[replier_id]) + +def dump_corpus_component( + corpus, dir_name, filename, obj_type, bin_name, exclude_vectors, fields_to_skip +): + with open(os.path.join(dir_name, filename), "w") as f: + d_bin = defaultdict(list) + objs = defaultdict(dict) + for obj_id in corpus.get_object_ids(obj_type): + objs[obj_id][KeyMeta] = dump_helper_bin( + corpus.get_object(obj_type, obj_id).meta, d_bin, fields_to_skip.get(obj_type, []) + ) + obj_vectors = corpus.get_object(obj_type, obj_id).vectors + objs[obj_id][KeyVectors] = ( + obj_vectors + if exclude_vectors is None + else list(set(obj_vectors) - set(exclude_vectors)) + ) + json.dump(objs, f) + + for name, l_bin in d_bin.items(): + with open(os.path.join(dir_name, name + "-{}-bin.p".format(bin_name)), "wb") as f_pk: + pickle.dump(l_bin, f_pk) + +BIN_DELIM_L, BIN_DELIM_R = "<##bin{", "}&&@**>" +KeyId = "id" +KeySpeaker = "speaker" +KeyConvoId = "conversation_id" +KeyReplyTo = "reply-to" +KeyTimestamp = "timestamp" +KeyText = "text" +DefinedKeys = {KeyId, KeySpeaker, KeyConvoId, KeyReplyTo, KeyTimestamp, KeyText} +KeyMeta = "meta" +KeyVectors = "vectors" + +JSONLIST_BUFFER_SIZE = 1000 + +def dump_helper_bin(d: ConvoKitMeta, d_bin: Dict, fields_to_skip=None) -> Dict: # object_idx + """ + :param d: The ConvoKitMeta to encode + :param d_bin: The dict of accumulated lists of binary attribs + :return: + """ + if fields_to_skip is None: + fields_to_skip = [] + + obj_idx = d.index.get_index(d.obj_type) + d_out = {} + for k, v in d.items(): + if k in fields_to_skip: + continue + try: + if len(obj_idx[k]) > 0 and obj_idx[k][0] 
== "bin": + d_out[k] = "{}{}{}".format(BIN_DELIM_L, len(d_bin[k]), BIN_DELIM_R) + d_bin[k].append(v) + else: + d_out[k] = v + except KeyError: + # fails silently (object has no such metadata that was indicated in metadata index) + pass + return d_out + +def dump_utterances(corpus, dir_name, exclude_vectors, fields_to_skip): + with open(os.path.join(dir_name, "utterances.jsonl"), "w") as f: + d_bin = defaultdict(list) + + for ut in corpus.iter_utterances(): + ut_obj = { + KeyId: ut.id, + KeyConvoId: ut.conversation_id, + KeyText: ut.text, + KeySpeaker: ut.speaker.id, + KeyMeta: dump_helper_bin(ut.meta, d_bin, fields_to_skip.get("utterance", [])), + KeyReplyTo: ut.reply_to, + KeyTimestamp: ut.timestamp, + KeyVectors: ut.vectors + if exclude_vectors is None + else list(set(ut.vectors) - set(exclude_vectors)), + } + json.dump(ut_obj, f) + f.write("\n") + + for name, l_bin in d_bin.items(): + with open(os.path.join(dir_name, name + "-bin.p"), "wb") as f_pk: + pickle.dump(l_bin, f_pk) From 3e97810f077129d0942a280c1ee98194c35620d4 Mon Sep 17 00:00:00 2001 From: liesenf Date: Sat, 22 Apr 2023 10:14:17 +0200 Subject: [PATCH 07/12] sync ipynb --- notebooks/exploration.ipynb | 74 ++++++++++++++++++++++++++++++++++--- 1 file changed, 68 insertions(+), 6 deletions(-) diff --git a/notebooks/exploration.ipynb b/notebooks/exploration.ipynb index 9fed698..ab344fb 100644 --- a/notebooks/exploration.ipynb +++ b/notebooks/exploration.ipynb @@ -18,7 +18,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## The `csv_to_json` module\n", + "## The `from_convokit` module\n", "\n", "Here, we import a single class from the `csv_to_json` module, and use it." 
] @@ -29,13 +29,75 @@ "metadata": {}, "outputs": [], "source": [ - "import sktalk.csv_to_json as cj\n" + "import sktalk.from_convokit as ck" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "df = pd.read_csv(\"../data/ulwa_testdata_convokit_format.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ID column is not present in utterances dataframe, generated ID column from dataframe index...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "10it [00:00, 4043.87it/s]\n" + ] + } + ], + "source": [ + "corpus = ck.Corpus.from_pandas(utterances_df = df)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "corpus.dump(name ='testcorpus', base_path=\"../data/\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The `csv_to_json` module\n", + "\n", + "Here, we import a single class from the `csv_to_json` module, and use it." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import sktalk.csv_to_json as cj\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, "outputs": [ { "data": { @@ -222,7 +284,7 @@ "9 atïm inim. 
" ] }, - "execution_count": 3, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -235,7 +297,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -244,7 +306,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -253,7 +315,7 @@ "'{\"begin\":{\"0\":\"00:00:00.917\",\"1\":\"00:00:04.830\",\"2\":\"00:00:06.090\",\"3\":\"00:00:09.534\",\"4\":\"00:00:10.333\",\"5\":\"00:00:11.143\",\"6\":\"00:00:11.477\",\"7\":\"00:00:14.390\",\"8\":\"00:00:17.972\",\"9\":\"00:00:18.240\"},\"end\":{\"0\":\"00:00:05.604\",\"1\":\"00:00:09.080\",\"2\":\"00:00:09.450\",\"3\":\"00:00:10.333\",\"4\":\"00:00:11.143\",\"5\":\"00:00:18.240\",\"6\":\"00:00:12.205\",\"7\":\"00:00:15.696\",\"8\":\"00:00:20.722\",\"9\":\"00:00:21.970\"},\"participant\":{\"0\":\"Tang\",\"1\":\"Yan\",\"2\":\"Tang\",\"3\":\"Yan\",\"4\":\"Tang\",\"5\":\"Yan\",\"6\":\"Tang\",\"7\":\"Yan\",\"8\":\"Tang\",\"9\":\"Yan\"},\"utterance\":{\"0\":\"U oughs inim t\\\\u00ef samting yan\",\"1\":\"mbam ndul ma wandam ana\",\"2\":\"M\\\\u00ef inim wandam bai anapa nd\",\"3\":\"lunda we nd\\\\u00efm\\\\u00efne in\",\"4\":\"k\\\\u00efnakape ak\\\\u00efnaka\",\"5\":\"coughs nd\\\\u00efm\\\\u00efne we ndul wa le we nd\\\\u00eft\\\\u00ef ak\\\\u00efnakape malimap mat\\\\u00ef yawa mananda\",\"6\":\"mananda\",\"7\":\"da\",\"8\":\"e k\\\\u00efkal awi ak\\\\u00efnakape\",\"9\":\"at\\\\u00efm inim.\"},\"translation\":{\"0\":\"Lorem ipsum dolor sit amet.\",\"1\":\"At neque fugit eum reprehenderit labore et exercitationem voluptatem. 
eos odio aspernatur.\",\"2\":\"a veritatis tempore sit vitae quaerat sed consequatur amet qui nisi facilis et perferendis nisi ut maiores consequatur.\",\"3\":null,\"4\":null,\"5\":\"Et illo facere vel magni necessitatibus est aspernatur numquam\",\"6\":null,\"7\":null,\"8\":\"onsequatur amet qui nisi facilis et perferendis nisi ut\",\"9\":\"itae quaerat sed consequatur amet\"},\"source\":{\"0\":\"\\\\/ulwa1\\\\/ulwa014\",\"1\":\"\\\\/ulwa1\\\\/ulwa014\",\"2\":\"\\\\/ulwa1\\\\/ulwa014\",\"3\":\"\\\\/ulwa1\\\\/ulwa014\",\"4\":\"\\\\/ulwa1\\\\/ulwa014\",\"5\":\"\\\\/ulwa1\\\\/ulwa014\",\"6\":\"\\\\/ulwa1\\\\/ulwa014\",\"7\":\"\\\\/ulwa1\\\\/ulwa014\",\"8\":\"\\\\/ulwa1\\\\/ulwa014\",\"9\":\"\\\\/ulwa1\\\\/ulwa014\"},\"utterance_raw\":{\"0\":\"U oughs inim t\\\\u00ef samting yangama ul mat\\\\u00ef ak\\\\u00efnakape\",\"1\":\"wimbam ndul ma wandam anapa ol welunda n\\\\u00efkap tu mananda yangama \",\"2\":\"M\\\\u00ef inim wandam bai anapa nd\\\\u00eft\\\\u00ef ka welunda unan\",\"3\":\"ata welunda we nd\\\\u00efm\\\\u00efne ind\",\"4\":\"i ak\\\\u00efnakape ak\\\\u00efnakap\",\"5\":\"[coughs] I inim oughs ka lopop mananda bai k\\\\u00efkal yangama we ini\",\"6\":\"n mananda nd\\\\u00eft\\\\u00ef ka ak\\\\u00efnakape wimbam\",\"7\":\"da nd\\\\u00eft\\\\u00ef ka\",\"8\":\"e k\\\\u00efkal awi ak\\\\u00efnakape man\\\\u00ef l\\\\u00ef\",\"9\":\"at\\\\u00efm inim.\"}}'" ] }, - "execution_count": 6, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } From dea127417fe3b8e2e76d3cbff9fc3b9dd8c6202e Mon Sep 17 00:00:00 2001 From: liesenf Date: Sat, 22 Apr 2023 11:01:18 +0200 Subject: [PATCH 08/12] branch to build and test the wav and mp3 reader --- notebooks/Make_audio_column.ipynb | 918 ++++++++++++++++++++++++++++++ sktalk/read_audio.py | 30 + 2 files changed, 948 insertions(+) create mode 100644 notebooks/Make_audio_column.ipynb create mode 100644 sktalk/read_audio.py diff --git a/notebooks/Make_audio_column.ipynb b/notebooks/Make_audio_column.ipynb 
new file mode 100644 index 0000000..5371296 --- /dev/null +++ b/notebooks/Make_audio_column.ipynb @@ -0,0 +1,918 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "RUcScYxiyCLa" + }, + "source": [ + "### To do list\n", + "todo:\n", + "- make debug function that logs errors\n", + "- make mel matrix \n", + "- make spec col" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kF0UisGqTpR5" + }, + "source": [ + "### Set working directory" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "223EdHm_Tnod", + "outputId": "96e4f251-729d-4d90-c914-45e5c7cdfd82" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current working directory: /vol/tensusers2/aliesenfeld/Elpaco dataset\n", + "Current working directory: /vol/tensusers2/aliesenfeld/Elpaco dataset\n" + ] + } + ], + "source": [ + "# Import the os module\n", + "import os\n", + "\n", + "# Print the current working directory\n", + "print(\"Current working directory: {0}\".format(os.getcwd()))\n", + "\n", + "# Change the current working directory\n", + "os.chdir('/vol/tensusers2/aliesenfeld/Elpaco dataset/')\n", + "#os.chdir('/Users/u517177/continuer_paper/')\n", + "\n", + "\n", + "# Print the current working directory\n", + "print(\"Current working directory: {0}\".format(os.getcwd()))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Part 1: audio extraction" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u-YN_F_oaAgz" + }, + "source": [ + "### Dependencies\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "dXTbO87ZZ_ao" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_1354822/732630340.py:9: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. 
Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", + " from tqdm.autonotebook import tqdm\n" + ] + } + ], + "source": [ + "#on M1 mac librosa needs to be installed via miniforge\n", + "#!conda install -c conda-forge librosa\n", + "import wave\n", + "import IPython\n", + "import librosa\n", + "import pandas as pd\n", + "from scipy.io import wavfile\n", + "import matplotlib.pyplot as plt\n", + "from tqdm.autonotebook import tqdm\n", + "from joblib import Parallel, delayed\n", + "n_jobs = -1; verbosity = 10" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0m\u001b[01;34m'Elpaco dataset'\u001b[0m/ \u001b[01;34moutput\u001b[0m/ streaks_10perlang.csv\n", + "\u001b[01;34m'continuer data'\u001b[0m/ \u001b[01;34moutput_trail\u001b[0m/ streaks_1perlang.csv\n", + " continuers_for_audio.csv \u001b[01;34mstreaks\u001b[0m/ streaks_50perlang.csv\n", + " discontinuers_for_audio.csv streaks.csv zic1z6cK\n" + ] + } + ], + "source": [ + "#install tensorflow if needed\n", + "\n", + "#!conda install -c conda-forge tensorflow -y\n", + "#!pip install tensorflow-macos" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inspect single audio files" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of channels 2\n", + "Sample width 2\n", + "Frame rate. 
48000\n", + "Number of frames 73200000\n", + "parameters: _wave_params(nchannels=2, sampwidth=2, framerate=48000, nframes=73200000, comptype='NONE', compname='not compressed')\n" + ] + } + ], + "source": [ + "#Load and inspect single files\n", + "audiofile = wave.open('./Elpaco dataset/akhoe_haikom1/state_hospital.wav','r')\n", + "\n", + "print( \"Number of channels\",audiofile.getnchannels())\n", + "print ( \"Sample width\",audiofile.getsampwidth())\n", + "print ( \"Frame rate.\",audiofile.getframerate())\n", + "print (\"Number of frames\",audiofile.getnframes())\n", + "print ( \"parameters:\",audiofile.getparams())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#play audio files\n", + "IPython.display.Audio('./Elpaco dataset/akhoe_haikom1/state_hospital.wav')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### read csv" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
uidlanguagebeginenddurationsourceparticipantutterance_strippedutteranceform_ascii
0akhoe_haikom-2-194-371425akhoe_haikom371.425371.725300.0/akhoe_haikom1/state_hospitaltx@Esîîi_
1akhoe_haikom-2-203-398335akhoe_haikom398.335398.545210.0/akhoe_haikom1/state_hospitaltx@Esîîi_
2akhoe_haikom-2-457-1245412akhoe_haikom1245.4121245.780368.0/akhoe_haikom1/state_hospitaltx@Gaîîi_
3akhoe_haikom-2-459-1247720akhoe_haikom1247.7201248.010290.0/akhoe_haikom1/state_hospitaltx@Gaîîi_
4akhoe_haikom-2-482-1290491akhoe_haikom1290.4911290.851360.0/akhoe_haikom1/state_hospitaltx@Gaîîi_
.................................
139546zaar-2-142-181303zaar181.303181.672369.0/zaar1/SAY_BC_CONV_02tx@SP1tôːtôːto_ː
139547zaar-2-367-449687zaar449.687450.145458.0/zaar1/SAY_BC_CONV_02tx@SP1m̀ːm̀ːm_ː
139548zaar-2-368-450762zaar450.762451.085323.0/zaar1/SAY_BC_CONV_02tx@SP1m̀ːm̀ːm_ː
139549zaar-3-459-440411zaar440.411440.985574.0/zaar1/SAY_BC_CONV_03tx@SP2m̀ːm̀ːm_ː
139550zaar-3-462-442380zaar442.380442.771391.0/zaar1/SAY_BC_CONV_03tx@SP2m̀ːm̀ːm_ː
\n", + "

139551 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " uid language begin end \\\n", + "0 akhoe_haikom-2-194-371425 akhoe_haikom 371.425 371.725 \n", + "1 akhoe_haikom-2-203-398335 akhoe_haikom 398.335 398.545 \n", + "2 akhoe_haikom-2-457-1245412 akhoe_haikom 1245.412 1245.780 \n", + "3 akhoe_haikom-2-459-1247720 akhoe_haikom 1247.720 1248.010 \n", + "4 akhoe_haikom-2-482-1290491 akhoe_haikom 1290.491 1290.851 \n", + "... ... ... ... ... \n", + "139546 zaar-2-142-181303 zaar 181.303 181.672 \n", + "139547 zaar-2-367-449687 zaar 449.687 450.145 \n", + "139548 zaar-2-368-450762 zaar 450.762 451.085 \n", + "139549 zaar-3-459-440411 zaar 440.411 440.985 \n", + "139550 zaar-3-462-442380 zaar 442.380 442.771 \n", + "\n", + " duration source participant \\\n", + "0 300.0 /akhoe_haikom1/state_hospital tx@Es \n", + "1 210.0 /akhoe_haikom1/state_hospital tx@Es \n", + "2 368.0 /akhoe_haikom1/state_hospital tx@Ga \n", + "3 290.0 /akhoe_haikom1/state_hospital tx@Ga \n", + "4 360.0 /akhoe_haikom1/state_hospital tx@Ga \n", + "... ... ... ... \n", + "139546 369.0 /zaar1/SAY_BC_CONV_02 tx@SP1 \n", + "139547 458.0 /zaar1/SAY_BC_CONV_02 tx@SP1 \n", + "139548 323.0 /zaar1/SAY_BC_CONV_02 tx@SP1 \n", + "139549 574.0 /zaar1/SAY_BC_CONV_03 tx@SP2 \n", + "139550 391.0 /zaar1/SAY_BC_CONV_03 tx@SP2 \n", + "\n", + " utterance_stripped utterance form_ascii \n", + "0 î î i_ \n", + "1 î î i_ \n", + "2 î î i_ \n", + "3 î î i_ \n", + "4 î î i_ \n", + "... ... ... ... 
\n", + "139546 tôː tôː to_ː \n", + "139547 m̀ː m̀ː m_ː \n", + "139548 m̀ː m̀ː m_ː \n", + "139549 m̀ː m̀ː m_ː \n", + "139550 m̀ː m̀ː m_ː \n", + "\n", + "[139551 rows x 10 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#read csv \n", + "df_audio = pd.read_csv('continuers_for_audio.csv')\n", + "df_audio" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preprocess csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Optional:\n", + "#check timestamp format and .div is needed to match audio extaction function requirements\n", + "df_audio['begin'] = df_audio['begin'].div(1000)\n", + "df_audio['end'] = df_audio['end'].div(1000)\n", + "df_audio" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
uidlanguagebeginenddurationsourceparticipantutterance_strippedutteranceform_ascii
0akhoe_haikom-2-494-1312920akhoe_haikom1312.9201313.280360.0/akhoe_haikom1/state_hospitaltx@Gaîîi_
1akie-1-033-99525akie99.525100.289764.0/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyoBaa_
2ambel-3-147-222600ambel222.600223.085485.0/ambel1/AM064ESD_Transcription-txt-wgoiiiiii
3ambel-4-470-662638ambel662.638663.212574.0/ambel1/AM067WG_Transcription-txt-wgommmmmm
4anal-01-098-184010anal184.010184.379369.0/anal1/anm_20160916_PO_Wolring_1Anal sp3mmmmmm
.................................
851yeli_dnye-1-457-1355836yeli_dnye1355.8361356.486650.0/yeli_dnye1/r03_v19_s2Knyââ[unk_utterance]nyâânya_â
852yeli_dnye-2-129-591369yeli_dnye591.369591.989620.0/yeli_dnye1/r03_v20_s5Kpmmmmmm
853yeli_dnye-3-113-151660yeli_dnye151.660151.920260.0/yeli_dnye1/r03_v21_s1Jamesnyâânyâânya_â
854zaar-2-368-450762zaar450.762451.085323.0/zaar1/SAY_BC_CONV_02tx@SP1m̀ːm̀ːm_ː
855zaar-3-462-442380zaar442.380442.771391.0/zaar1/SAY_BC_CONV_03tx@SP2m̀ːm̀ːm_ː
\n", + "

856 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " uid language begin end duration \\\n", + "0 akhoe_haikom-2-494-1312920 akhoe_haikom 1312.920 1313.280 360.0 \n", + "1 akie-1-033-99525 akie 99.525 100.289 764.0 \n", + "2 ambel-3-147-222600 ambel 222.600 223.085 485.0 \n", + "3 ambel-4-470-662638 ambel 662.638 663.212 574.0 \n", + "4 anal-01-098-184010 anal 184.010 184.379 369.0 \n", + ".. ... ... ... ... ... \n", + "851 yeli_dnye-1-457-1355836 yeli_dnye 1355.836 1356.486 650.0 \n", + "852 yeli_dnye-2-129-591369 yeli_dnye 591.369 591.989 620.0 \n", + "853 yeli_dnye-3-113-151660 yeli_dnye 151.660 151.920 260.0 \n", + "854 zaar-2-368-450762 zaar 450.762 451.085 323.0 \n", + "855 zaar-3-462-442380 zaar 442.380 442.771 391.0 \n", + "\n", + " source \\\n", + "0 /akhoe_haikom1/state_hospital \n", + "1 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo \n", + "2 /ambel1/AM064 \n", + "3 /ambel1/AM067 \n", + "4 /anal1/anm_20160916_PO_Wolring_1 \n", + ".. ... \n", + "851 /yeli_dnye1/r03_v19_s2 \n", + "852 /yeli_dnye1/r03_v20_s5 \n", + "853 /yeli_dnye1/r03_v21_s1 \n", + "854 /zaar1/SAY_BC_CONV_02 \n", + "855 /zaar1/SAY_BC_CONV_03 \n", + "\n", + " participant utterance_stripped utterance \\\n", + "0 tx@Ga î î \n", + "1 B aá aá \n", + "2 ESD_Transcription-txt-wgo ii ii \n", + "3 WG_Transcription-txt-wgo mm mm \n", + "4 Anal sp3 mm mm \n", + ".. ... ... ... \n", + "851 K nyââ [unk_utterance]nyââ \n", + "852 Kp mm mm \n", + "853 James nyââ nyââ \n", + "854 tx@SP1 m̀ː m̀ː \n", + "855 tx@SP2 m̀ː m̀ː \n", + "\n", + " form_ascii \n", + "0 i_ \n", + "1 aa_ \n", + "2 ii \n", + "3 mm \n", + "4 mm \n", + ".. ... 
\n", + "851 nya_â \n", + "852 mm \n", + "853 nya_â \n", + "854 m_ː \n", + "855 m_ː \n", + "\n", + "[856 rows x 10 columns]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#drop czech because it has separate speaker channels\n", + "df_audio = df_audio[df_audio.language != \"czech\"]\n", + "\n", + "#exclude corpora without audio files\n", + "df_audio = df_audio[df_audio.language != \"italian\"]\n", + "df_audio = df_audio[df_audio.language != \"zacatepec_chatino\"]\n", + "\n", + "#drop files without audio\n", + "\n", + "df_audio.dropna(subset=['begin','end'], inplace=True)\n", + "df_audio = df_audio.groupby(['source']).apply(lambda x: x.sample(1, replace=True)).reset_index(drop=True)\n", + "\n", + "df_audio" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `read_audio` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_row_audio(df_audio, wav_loc):\n", + " \"\"\" load audio and grab individual turns\n", + " TODO: for large sparse WAV files, the audio should be loaded only for the turn\n", + " \"\"\"\n", + " \n", + " #print path to audio file (optional)\n", + " print(wav_loc)\n", + "\n", + " # load audio\n", + " try:\n", + " #Three options here, comment out as needed:\n", + "\n", + " #scipy wavfile\n", + " #rate, data = wavfile.read(wav_loc)\n", + " \n", + " #librosa\n", + " rate, data = librosa.load(wav_loc)\n", + " \n", + " #base python\n", + " #rate, data = wave.open(wav_loc)\n", + "\n", + " #explore fourth option: use ffmpeg directly \n", + " #ffmpeg?\n", + "\n", + " data = data.astype('float32')\n", + " \n", + " # get audio for each turn\n", + " df_audio[\"audio\"] = [\n", + " data[int(st * rate) : int(et * rate)].copy(deep=True)\n", + " for st, et in zip(df_audio.begin.values, df_audio.end.values)\n", + " ]\n", + "\n", + " df_audio[\"rate\"] = rate\n", + " except 
Exception:\n", + " pass\n", + "\n", + " return df_audio" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loops for execution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#execute as for loop\n", + "[get_row_audio(df_audio[df_audio.source == source], './Elpaco dataset'+ source +'.wav') for source in df_audio.source]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#execute using parallel \n", + "\n", + "with Parallel(n_jobs=1, verbose=verbosity) as parallel:\n", + " df_audios = parallel(\n", + " delayed(get_row_audio)(\n", + " df_audio[df_audio.source == source], \n", + " './Elpaco dataset'+ source +'.wav', #Edit path to corpus directory here\n", + " )\n", + " for source in tqdm(df_audio.source)\n", + " )\n", + "df_audio = pd.concat(df_audios)\n", + "len(df_audio)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Normalize audio (optional)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# normalize audio if needed\n", + "df_audio['audio'] = [librosa.util.normalize(i) for i in df_audio.audio.values]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plot `audio` column waveforms" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 5%|▍ | 11/240 [00:00<00:00, 254.29it/s]\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# plot some example audio \n", + "nrows = 20\n", + "ncols = 12\n", + "zoom = 2\n", + "fig, axs = plt.subplots(ncols=ncols, nrows = nrows,figsize = (ncols*zoom, nrows+zoom/1.5))\n", + "for i, turn in tqdm(enumerate(df_audio['audio'].values), total = nrows*ncols):\n", + " ax = axs.flatten()[i]\n", + " ax.plot(turn)\n", + " if i == nrows*ncols -1:\n", + " break" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Save as `csv`" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'df_audio' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/Users/u517177/3.1_Make_audio_and_spec_cols.ipynb Cell 16\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0m df_audio\u001b[39m.\u001b[39mto_csv(\u001b[39m\"\u001b[39m\u001b[39mdf_audio.csv\u001b[39m\u001b[39m\"\u001b[39m, index\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 2\u001b[0m df_audio\n", + "\u001b[0;31mNameError\u001b[0m: name 'df_audio' is not defined" + ] + } + ], + "source": [ + "df_audio.to_csv(\"df_audio.csv\", index=False)\n", + "df_audio" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + }, + "vscode": { + "interpreter": { + "hash": "46fbe6e04536e1e35bdd7bb388ca2958a28e6ed66e8c3e8ff3ca43c0ea6b4829" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/sktalk/read_audio.py b/sktalk/read_audio.py new file 
mode 100644 index 0000000..3f3d988 --- /dev/null +++ b/sktalk/read_audio.py @@ -0,0 +1,30 @@ +"""Documentation about the scikit-talk module.""" + +#module .py to build the read_audio feature + +# FIXME: put actual code here +def hello(name): + """Say hello + + Function docstring using Google docstring style. + + Args: + name (str): Name to say hello to + + Returns: + str: Hello message + + Raises: + ValueError: If `name` is equal to `nobody` + + Example: + This function can be called with `Jane Smith` as argument using + + >>> from sktalk.my_module import hello + >>> hello('Jane Smith') + 'Hello Jane Smith!' + + """ + if name == 'nobody': + raise ValueError('Can not say hello to nobody') + return f'Hello {name}!' From e972109a39b6449bf73d6503101b05c5855f5ae4 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Sun, 14 May 2023 22:01:51 +0200 Subject: [PATCH 09/12] add functions to parse audio with ffmpeg --- sktalk/read_audio.py | 54 +++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/sktalk/read_audio.py b/sktalk/read_audio.py index 3f3d988..57ad989 100644 --- a/sktalk/read_audio.py +++ b/sktalk/read_audio.py @@ -1,30 +1,42 @@ -"""Documentation about the scikit-talk module.""" +import subprocess +import json +import numpy as np -#module .py to build the read_audio feature +def get_sampling_rate(file_path): + cmd = [ + "ffprobe", + "-v", "quiet", + "-print_format", "json", + "-show_streams", + file_path + ] -# FIXME: put actual code here -def hello(name): - """Say hello + result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + output = json.loads(result.stdout) - Function docstring using Google docstring style. 
+ for stream in output["streams"]: + if stream["codec_type"] == "audio": + return int(stream["sample_rate"]) - Args: - name (str): Name to say hello to + raise ValueError("No audio stream found in the file") - Returns: - str: Hello message +# Replace 'list_audio_3_40_balanced.wav' with the path to your audio file +# file_path = './Elpaco dataset/akhoe_haikom1/state_hospital.wav' - Raises: - ValueError: If `name` is equal to `nobody` +# sampling_rate = get_sampling_rate(file_path) +# print(sampling_rate) - Example: - This function can be called with `Jane Smith` as argument using - >>> from sktalk.my_module import hello - >>> hello('Jane Smith') - 'Hello Jane Smith!' +def get_audio_ffmpeg(file_path): + cmd = ["ffmpeg", "-i", file_path, '-f', 's16le', + '-acodec', 'pcm_s16le', + '-ar', '22050', + '-ac', '1', + '-'] - """ - if name == 'nobody': - raise ValueError('Can not say hello to nobody') - return f'Hello {name}!' + pipe = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + raw_audio = pipe.stdout + audio_array = np.frombuffer(raw_audio, dtype="int16") + audio_array = audio_array.astype(np.float32) / np.iinfo(np.int16).max + + return audio_array From 1258264aad5e75911c82ba7bd0fcfbff4b465d75 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 2 Jun 2023 14:13:20 +0200 Subject: [PATCH 10/12] audio notebook edits Barbara --- notebooks/Make_audio_column_Barbara.ipynb | 2836 +++++++++++++++++++++ 1 file changed, 2836 insertions(+) create mode 100644 notebooks/Make_audio_column_Barbara.ipynb diff --git a/notebooks/Make_audio_column_Barbara.ipynb b/notebooks/Make_audio_column_Barbara.ipynb new file mode 100644 index 0000000..4df6dd7 --- /dev/null +++ b/notebooks/Make_audio_column_Barbara.ipynb @@ -0,0 +1,2836 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "RUcScYxiyCLa" + }, + "source": [ + "### To do list\n", + "todo:\n", + "- make debug function that logs errors\n", + "- make mel matrix \n", + "- 
make spec col" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# to ensure that the modules can be imported, as they are located in a different folder, add the package root to the path:\n", + "\n", + "import sys\n", + "sys.path.insert(0, \"../\")\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "kF0UisGqTpR5" + }, + "source": [ + "### Set working directory" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "223EdHm_Tnod", + "outputId": "96e4f251-729d-4d90-c914-45e5c7cdfd82" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current working directory: /home/bvreede\n" + ] + } + ], + "source": [ + "# Import the os module\n", + "import os\n", + "\n", + "# Print the current working directory\n", + "print(\"Current working directory: {0}\".format(os.getcwd()))\n", + "\n", + "# Change the current working directory\n", + "datadir = ('/vol/tensusers2/aliesenfeld/Elpaco dataset/')\n", + "#packagedir = ('vol/)\n", + "#os.chdir('/Users/u517177/continuer_paper/')\n", + "\n", + "\n", + "# Print the current working directory\n", + "#print(\"Current working directory: {0}\".format(os.getcwd()))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Part 1: audio extraction" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "u-YN_F_oaAgz" + }, + "source": [ + "### Dependencies\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "dXTbO87ZZ_ao" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_509310/67386651.py:13: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from tqdm.autonotebook import tqdm\n" + ] + } + ], + "source": [ + "#on M1 mac librosa needs to be installed via miniforge\n", + "#!conda install -c conda-forge librosa\n", + "import subprocess\n", + "import json\n", + "import sounddevice as sd\n", + "\n", + "import wave\n", + "import IPython\n", + "import librosa\n", + "import pandas as pd\n", + "from scipy.io import wavfile\n", + "import matplotlib.pyplot as plt\n", + "from tqdm.autonotebook import tqdm\n", + "from joblib import Parallel, delayed\n", + "n_jobs = -1; verbosity = 10" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0m\u001b[01;34m'Elpaco dataset'\u001b[0m/ \u001b[01;34moutput\u001b[0m/ streaks_10perlang.csv\n", + "\u001b[01;34m'continuer data'\u001b[0m/ \u001b[01;34moutput_trail\u001b[0m/ streaks_1perlang.csv\n", + " continuers_for_audio.csv \u001b[01;34mstreaks\u001b[0m/ streaks_50perlang.csv\n", + " discontinuers_for_audio.csv streaks.csv zic1z6cK\n" + ] + } + ], + "source": [ + "#install tensorflow if needed\n", + "\n", + "#!conda install -c conda-forge tensorflow -y\n", + "#!pip install tensorflow-macos" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inspect single audio files" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: './Elpaco dataset/akhoe_haikom1/state_hospital.wav'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39m#Load and inspect single 
files\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m audiofile \u001b[39m=\u001b[39m wave\u001b[39m.\u001b[39;49mopen(\u001b[39m'\u001b[39;49m\u001b[39m./Elpaco dataset/akhoe_haikom1/state_hospital.wav\u001b[39;49m\u001b[39m'\u001b[39;49m,\u001b[39m'\u001b[39;49m\u001b[39mr\u001b[39;49m\u001b[39m'\u001b[39;49m)\n\u001b[1;32m 4\u001b[0m \u001b[39mprint\u001b[39m( \u001b[39m\"\u001b[39m\u001b[39mNumber of channels\u001b[39m\u001b[39m\"\u001b[39m,audiofile\u001b[39m.\u001b[39mgetnchannels())\n\u001b[1;32m 5\u001b[0m \u001b[39mprint\u001b[39m ( \u001b[39m\"\u001b[39m\u001b[39mSample width\u001b[39m\u001b[39m\"\u001b[39m,audiofile\u001b[39m.\u001b[39mgetsampwidth())\n", + "File \u001b[0;32m/usr/lib/python3.8/wave.py:510\u001b[0m, in \u001b[0;36mopen\u001b[0;34m(f, mode)\u001b[0m\n\u001b[1;32m 508\u001b[0m mode \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39mrb\u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m 509\u001b[0m \u001b[39mif\u001b[39;00m mode \u001b[39min\u001b[39;00m (\u001b[39m'\u001b[39m\u001b[39mr\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mrb\u001b[39m\u001b[39m'\u001b[39m):\n\u001b[0;32m--> 510\u001b[0m \u001b[39mreturn\u001b[39;00m Wave_read(f)\n\u001b[1;32m 511\u001b[0m \u001b[39melif\u001b[39;00m mode \u001b[39min\u001b[39;00m (\u001b[39m'\u001b[39m\u001b[39mw\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mwb\u001b[39m\u001b[39m'\u001b[39m):\n\u001b[1;32m 512\u001b[0m \u001b[39mreturn\u001b[39;00m Wave_write(f)\n", + "File \u001b[0;32m/usr/lib/python3.8/wave.py:160\u001b[0m, in \u001b[0;36mWave_read.__init__\u001b[0;34m(self, f)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_i_opened_the_file \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 159\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(f, \u001b[39mstr\u001b[39m):\n\u001b[0;32m--> 160\u001b[0m f \u001b[39m=\u001b[39m builtins\u001b[39m.\u001b[39;49mopen(f, 
\u001b[39m'\u001b[39;49m\u001b[39mrb\u001b[39;49m\u001b[39m'\u001b[39;49m)\n\u001b[1;32m 161\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_i_opened_the_file \u001b[39m=\u001b[39m f\n\u001b[1;32m 162\u001b[0m \u001b[39m# else, assume it is an open file object already\u001b[39;00m\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './Elpaco dataset/akhoe_haikom1/state_hospital.wav'" + ] + } + ], + "source": [ + "#Load and inspect single files\n", + "audiofile = wave.open('./Elpaco dataset/akhoe_haikom1/state_hospital.wav','r')\n", + "\n", + "print( \"Number of channels\",audiofile.getnchannels())\n", + "print ( \"Sample width\",audiofile.getsampwidth())\n", + "print ( \"Frame rate.\",audiofile.getframerate())\n", + "print (\"Number of frames\",audiofile.getnframes())\n", + "print ( \"parameters:\",audiofile.getparams())" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#play audio files\n", + "#IPython.display.Audio('./Elpaco dataset/akhoe_haikom1/state_hospital.wav')\n", + "\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### read csv" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
uidlanguagebeginenddurationsourceparticipantutterance_strippedutteranceform_ascii
0akhoe_haikom-2-484-1291852akhoe_haikom1291.8521292.192340.0/akhoe_haikom1/state_hospitaltx@Gaîîi_
1akie-1-212-449198akie449.198449.791593.0/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyoBaa_
2ambel-3-509-809080ambel809.080810.020940.0/ambel1/AM064ESD_Transcription-txt-wgooooooo
3ambel-4-125-205440ambel205.440206.100660.0/ambel1/AM067WG_Transcription-txt-wgommmmmm
4anal-01-283-507370anal507.370507.890520.0/anal1/anm_20160916_PO_Wolring_1Anal sp2mmmmmm
5anal-05-029-60356anal60.35661.126770.0/anal1/anm_20160924_Thotson_grandmothers_1Anal sp2inginging
6anal-08-437-607867anal607.867608.365498.0/anal1/anm_20161014_PO_Darchol_evening_convers...anal speaker 4ummummumm
7anal-09-264-932963anal932.963934.3831420.0/anal1/anm_20161014_PO_Ralruwng_family_lunch1Anal sp1ummummumm
8anal-12-577-1069261anal1069.2611070.2981037.0/anal1/anm_20161210_oklu_chatting3Anal sp 5ummummumm
9anal-13-047-69501anal69.50170.127626.0/anal1/anm_20190803_grandmasanal speaker 1amatoamatoamato
\n", + "
" + ], + "text/plain": [ + " uid language begin end duration \n", + "0 akhoe_haikom-2-484-1291852 akhoe_haikom 1291.852 1292.192 340.0 \\\n", + "1 akie-1-212-449198 akie 449.198 449.791 593.0 \n", + "2 ambel-3-509-809080 ambel 809.080 810.020 940.0 \n", + "3 ambel-4-125-205440 ambel 205.440 206.100 660.0 \n", + "4 anal-01-283-507370 anal 507.370 507.890 520.0 \n", + "5 anal-05-029-60356 anal 60.356 61.126 770.0 \n", + "6 anal-08-437-607867 anal 607.867 608.365 498.0 \n", + "7 anal-09-264-932963 anal 932.963 934.383 1420.0 \n", + "8 anal-12-577-1069261 anal 1069.261 1070.298 1037.0 \n", + "9 anal-13-047-69501 anal 69.501 70.127 626.0 \n", + "\n", + " source \n", + "0 /akhoe_haikom1/state_hospital \\\n", + "1 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo \n", + "2 /ambel1/AM064 \n", + "3 /ambel1/AM067 \n", + "4 /anal1/anm_20160916_PO_Wolring_1 \n", + "5 /anal1/anm_20160924_Thotson_grandmothers_1 \n", + "6 /anal1/anm_20161014_PO_Darchol_evening_convers... \n", + "7 /anal1/anm_20161014_PO_Ralruwng_family_lunch1 \n", + "8 /anal1/anm_20161210_oklu_chatting3 \n", + "9 /anal1/anm_20190803_grandmas \n", + "\n", + " participant utterance_stripped utterance form_ascii \n", + "0 tx@Ga î î i_ \n", + "1 B aá aá aa_ \n", + "2 ESD_Transcription-txt-wgo oo oo oo \n", + "3 WG_Transcription-txt-wgo mm mm mm \n", + "4 Anal sp2 mm mm mm \n", + "5 Anal sp2 ing ing ing \n", + "6 anal speaker 4 umm umm umm \n", + "7 Anal sp1 umm umm umm \n", + "8 Anal sp 5 umm umm umm \n", + "9 anal speaker 1 amato amato amato " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#read csv \n", + "datasource = datadir + 'continuers_for_audio.csv'\n", + "\n", + "df_audio = pd.read_csv(datasource)\n", + "#Optional:\n", + "#check timestamp format and .div is needed to match audio extaction function requirements\n", + "df_audio['begin'] = df_audio['begin'].div(1000)\n", + "df_audio['end'] = df_audio['end'].div(1000)\n", + "\n", + "#drop czech 
because it has separate speaker channels\n", + "df_audio = df_audio[df_audio.language != \"czech\"]\n", + "\n", + "#exclude corpora without audio files\n", + "df_audio = df_audio[df_audio.language != \"italian\"]\n", + "df_audio = df_audio[df_audio.language != \"zacatepec_chatino\"]\n", + "\n", + "#drop files without audio\n", + "\n", + "df_audio.dropna(subset=['begin','end'], inplace=True)\n", + "df_audio = df_audio.groupby(['source']).apply(lambda x: x.sample(1, replace=True)).reset_index(drop=True)\n", + "\n", + "df_audio_sub = df_audio[:10]\n", + "df_audio_sub" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `read_audio` function" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'sktalk'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msktalk\u001b[39;00m \u001b[39mimport\u001b[39;00m read_audio\n\u001b[1;32m 3\u001b[0m \u001b[39m# def get_sampling_rate(file_path):\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[39m# cmd = [\u001b[39;00m\n\u001b[1;32m 5\u001b[0m \u001b[39m# \"ffprobe\",\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 39\u001b[0m \n\u001b[1;32m 40\u001b[0m \u001b[39m# return audio_array\u001b[39;00m\n\u001b[1;32m 43\u001b[0m wavaddress \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m/vol/tensusers2/aliesenfeld/Elpaco dataset/Elpaco dataset/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo.wav\u001b[39m\u001b[39m\"\u001b[39m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'sktalk'" + ] + } + ], + "source": [ + "from sktalk import read_audio\n", + "\n", + "# def 
get_sampling_rate(file_path):\n", + "# cmd = [\n", + "# \"ffprobe\",\n", + "# \"-v\", \"quiet\",\n", + "# \"-print_format\", \"json\",\n", + "# \"-show_streams\",\n", + "# file_path\n", + "# ]\n", + "\n", + "# result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)\n", + "# output = json.loads(result.stdout)\n", + "\n", + "# for stream in output[\"streams\"]:\n", + "# if stream[\"codec_type\"] == \"audio\":\n", + "# return int(stream[\"sample_rate\"])\n", + "\n", + "# raise ValueError(\"No audio stream found in the file\")\n", + "\n", + "# # Replace 'list_audio_3_40_balanced.wav' with the path to your audio file\n", + "# # file_path = './Elpaco dataset/akhoe_haikom1/state_hospital.wav'\n", + "\n", + "# # sampling_rate = get_sampling_rate(file_path)\n", + "# # print(sampling_rate)\n", + "\n", + "\n", + "# def get_audio_ffmpeg(file_path):\n", + "# cmd = [\"ffmpeg\", \"-i\", file_path, '-f', 's16le',\n", + "# '-acodec', 'pcm_s16le',\n", + "# '-ar', '22050',\n", + "# '-ac', '1',\n", + "# '-']\n", + "\n", + "# pipe = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n", + "# raw_audio = pipe.stdout\n", + "# audio_array = np.frombuffer(raw_audio, dtype=\"int16\")\n", + "# audio_array = audio_array.astype(np.float32) / np.iinfo(np.int16).max\n", + "\n", + "# return audio_array\n", + "\n", + "\n", + "wavaddress = \"/vol/tensusers2/aliesenfeld/Elpaco dataset/Elpaco dataset/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo.wav\"\n", + "d_ffmpeg = read_audio.get_audio_ffmpeg(wavaddress)\n", + "r_ffmpeg = read_audio.get_sampling_rate(wavaddress)\n", + "\n", + "#ffmpeg -ss start_second -to end_second -i input.mp3 output.mp3\n", + "\n", + "d_librosa, r_librosa = librosa.load(wavaddress)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0.0000000e+00 -3.0518509e-05 -3.0518509e-05 -3.0518509e-05\n", + " -6.1037019e-05 
-9.1555528e-05 -1.2207404e-04 -1.5259255e-04\n", + " -2.1362957e-04 -2.4414808e-04]\n", + "[ 3.2782555e-07 -4.1872263e-05 -2.2262335e-05 -4.6558678e-05\n", + " -5.6192279e-05 -8.2075596e-05 -1.2717396e-04 -1.5640259e-04\n", + " -2.0396709e-04 -2.4517626e-04]\n", + "24007680\n", + "24007680\n", + "1.0\n", + "1.2539232\n", + "-1.0000305\n", + "-1.0577291\n", + "44100\n", + "22050\n" + ] + } + ], + "source": [ + "print(d_ffmpeg[:10])\n", + "print(d_librosa[:10])\n", + "\n", + "print(len(d_ffmpeg))\n", + "print(len(d_librosa))\n", + "\n", + "print(max(d_ffmpeg))\n", + "print(max(d_librosa))\n", + "\n", + "print(min(d_ffmpeg))\n", + "print(min(d_librosa))\n", + "\n", + "\n", + "print(r_ffmpeg)\n", + "print(r_librosa)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "\n", + "with open(\"../barbara_test/d_librosa.pickle\", \"wb\") as f:\n", + " pickle.dump(d_librosa, f)\n", + "\n", + "with open(\"../barbara_test/r_librosa.pickle\", \"wb\") as f:\n", + " pickle.dump(r_librosa, f)\n", + "\n", + "with open(\"../barbara_test/d_ffmpeg.pickle\", \"wb\") as f:\n", + " pickle.dump(d_ffmpeg, f)\n", + "\n", + "with open(\"../barbara_test/r_ffmpeg.pickle\", \"wb\") as f:\n", + " pickle.dump(r_ffmpeg, f)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "plt.plot(d_ffmpeg)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(d_librosa)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "import soundfile as sf\n", + "import numpy as np\n", + "import simpleaudio as sa\n", + "\n", + "# Generate a sine wave\n", + "freq = 440\n", + "duration = 3\n", + "sample_rate = 44100\n", + "t = np.linspace(0, duration, int(duration * sample_rate), False)\n", + "waveform = np.sin(freq * t * 2 * np.pi) * 32767\n", + "\n", + "# Play the audio\n", + "audio_data = waveform.astype(np.int16)\n", + "play_obj = sa.play_buffer(audio_data, 1, 2, sample_rate)\n", + "\n", + "# Wait for the audio to finish playing\n", + "play_obj.wait_done()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import soundfile as sf\n", + "import numpy as np\n", + "import simpleaudio as sa\n", + "\n", + "# Read the WAV file\n", + "filename = \"./Elpaco dataset/akhoe_haikom1/state_hospital.wav\"\n", + "audio_data, sample_rate = sf.read(filename)\n", + "\n", + "# Convert the audio data to the correct format\n", + "audio_data = (audio_data * 32767).astype(np.int16)\n", + "\n", + "# Play the audio\n", + "play_obj = sa.play_buffer(audio_data, 2, 2, sample_rate)\n", + "\n", + "# Wait for the audio to finish playing\n", + "play_obj.wait_done()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo 112.708 113.647\n" + ] + } + ], + "source": [ + "n = 1\n", + "\n", + "audiorow_start = df_audio[\"begin\"][n]\n", + "audiorow_end = df_audio[\"end\"][n]\n", + "audiorow_file = df_audio[\"source\"][n]\n", + "\n", + "print(audiorow_file, audiorow_start, audiorow_end)2\n", + "\n", + "#get_audio_ffmpeg(audiorow_file, audiorow_start, audiorow_end)\n", + "\n", + "ffmpeg -ss 
112.708 -to 113.647 -i\n", + "/vol/tensusers2/aliesenfeld/Elpaco\\ dataset/Elpaco\\ dataset/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo.wav" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo\n", + "Name: source, dtype: object\n", + "2 /ambel1/AM064\n", + "Name: source, dtype: object\n", + "3 /ambel1/AM067\n", + "Name: source, dtype: object\n", + "4 /anal1/anm_20160916_PO_Wolring_1\n", + "Name: source, dtype: object\n", + "5 /anal1/anm_20160924_Thotson_grandmothers_1\n", + "Name: source, dtype: object\n", + "6 /anal1/anm_20161014_PO_Darchol_evening_convers...\n", + "Name: source, dtype: object\n", + "7 /anal1/anm_20161014_PO_Ralruwng_family_lunch1\n", + "Name: source, dtype: object\n", + "8 /anal1/anm_20161210_oklu_chatting3\n", + "Name: source, dtype: object\n", + "9 /anal1/anm_20190803_grandmas\n", + "Name: source, dtype: object\n" + ] + }, + { + "data": { + "text/plain": [ + "[None, None, None, None, None, None, None, None, None]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "def get_row_audio(df):\n", + " \"\"\" load audio and grab individual turns\n", + " TODO: for large sparse WAV files, the audio should be loaded only for the turn\n", + " \"\"\"\n", + " print(df[\"source\"])\n", + " #source = df[\"source\"][0]\n", + " #wav_loc = f'./Elpaco dataset{source}.wav'\n", + " #print path to audio file (optional)\n", + " #print(wav_loc)\n", + "\n", + " # # load audio\n", + " # try:\n", + " # #Three options here, comment out as needed:\n", + "\n", + " # #scipy wavfile\n", + " # #rate, data = wavfile.read(wav_loc)\n", + " \n", + " # #librosa\n", + " # rate, data = librosa.load(wav_loc)\n", + " \n", + " # #base python\n", + " # #rate, data = wave.open(wav_loc)\n", + "\n", + " # #explore fourth option: use ffmpeg directly \n", + " # 
#ffmpeg?\n", + "\n", + " # data = data.astype('float32')\n", + " \n", + " # # get audio for each turn\n", + " # df_audio[\"audio\"] = [\n", + " # data[int(st * rate) : int(et * rate)].copy(deep=True)\n", + " # for st, et in zip(df_audio.begin.values, df_audio.end.values)\n", + " # ]\n", + "\n", + " # df_audio[\"rate\"] = rate\n", + " # except Exception:\n", + " # pass\n", + "\n", + " # return df_audio\n", + "\n", + "[get_row_audio(df_audio[df_audio.source == source]) for source in df_audio.source[1:10]]\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "./Elpaco dataset/akhoe_haikom1/state_hospital.wav\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df[\"rate\"] = rate\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "./Elpaco dataset/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo.wav\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df[\"rate\"] = rate\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "./Elpaco dataset/ambel1/AM064.wav\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df[\"rate\"] = rate\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "./Elpaco dataset/ambel1/AM067.wav\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df[\"rate\"] = rate\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "./Elpaco dataset/anal1/anm_20160916_PO_Wolring_1.wav\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df[\"rate\"] = rate\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "./Elpaco dataset/anal1/anm_20160924_Thotson_grandmothers_1.wav\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the 
caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df[\"rate\"] = rate\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "./Elpaco dataset/anal1/anm_20161014_PO_Darchol_evening_conversation2.wav\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df[\"rate\"] = rate\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "./Elpaco dataset/anal1/anm_20161014_PO_Ralruwng_family_lunch1.wav\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df[\"rate\"] = rate\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "./Elpaco dataset/anal1/anm_20161210_oklu_chatting3.wav\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df[\"rate\"] = rate\n" + ] + }, + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "./Elpaco dataset/anal1/anm_20190803_grandmas.wav\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df[\"rate\"] = rate\n" + ] + } + ], + "source": [ + "def get_row_audio(df, wav_loc):\n", + " \"\"\" load audio and grab individual turns\n", + " TODO: for large sparse WAV files, the audio should be loaded only for the turn\n", + " \"\"\"\n", + " \n", + " #print path to audio file (optional)\n", + " print(wav_loc)\n", + "\n", + " # load audio\n", + "\n", + " #Three options here, comment out as needed:\n", + "\n", + " #scipy wavfile\n", + " #rate, data = wavfile.read(wav_loc)\n", + " \n", + " #librosa\n", + " data, rate = librosa.load(wav_loc)\n", + " \n", + " #base python\n", + " #rate, data = wave.open(wav_loc)\n", + "\n", + " #explore fourth option: use ffmpeg directly \n", + " #ffmpeg?\n", + "\n", + " data = data.astype('float32')\n", + " \n", + " # get audio for each turn\n", + " # df_audio[\"audio\"] = [\n", + " # data[int(st * rate) : int(et * rate)].copy()\n", + " # for st, et in zip(df_audio.begin.values, df_audio.end.values)\n", + " # ]\n", + "\n", + " df[\"rate\"] = rate\n", + "\n", + " return df\n", + "\n", + "sounddata = [get_row_audio(df_audio[df_audio.source == source], f'./Elpaco dataset{source}.wav') for source in df_audio.source[:10]]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "33626250" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(sounddata)" + ] + 
}, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 1313.280\n", + "1 441.654\n", + "2 327.840\n", + "3 391.990\n", + "4 527.017\n", + "5 63.633\n", + "6 152.779\n", + "7 1278.735\n", + "8 1070.298\n", + "9 659.145\n", + "Name: end, dtype: float64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_audio_sub.iloc[:,3]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loops for execution" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "./Elpaco dataset/akhoe_haikom1/state_hospital.wav\n", + "[-0.0135576 0.00900845 -0.00217441 ... -0.00039207 -0.00038838\n", + " -0.00041938]\n", + "./Elpaco dataset/akhoe_haikom1/state_hospital.wav\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2450982/2935496521.py:19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df_audio[\"rate\"] = rate\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-0.0135576 0.00900845 -0.00217441 ... 
-0.00039207 -0.00038838\n", + " -0.00041938]\n", + "./Elpaco dataset/akhoe_haikom1/state_hospital.wav\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2450982/2935496521.py:19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df_audio[\"rate\"] = rate\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-0.0135576 0.00900845 -0.00217441 ... -0.00039207 -0.00038838\n", + " -0.00041938]\n", + "./Elpaco dataset/akhoe_haikom1/state_hospital.wav\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2450982/2935496521.py:19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df_audio[\"rate\"] = rate\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-0.0135576 0.00900845 -0.00217441 ... 
-0.00039207 -0.00038838\n", + " -0.00041938]\n", + "./Elpaco dataset/akhoe_haikom1/state_hospital.wav\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2450982/2935496521.py:19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df_audio[\"rate\"] = rate\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-0.0135576 0.00900845 -0.00217441 ... -0.00039207 -0.00038838\n", + " -0.00041938]\n", + "./Elpaco dataset/akhoe_haikom1/state_hospital.wav\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2450982/2935496521.py:19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df_audio[\"rate\"] = rate\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-0.0135576 0.00900845 -0.00217441 ... 
-0.00039207 -0.00038838\n", + " -0.00041938]\n", + "./Elpaco dataset/akhoe_haikom1/state_hospital.wav\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2450982/2935496521.py:19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df_audio[\"rate\"] = rate\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-0.0135576 0.00900845 -0.00217441 ... -0.00039207 -0.00038838\n", + " -0.00041938]\n", + "./Elpaco dataset/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo.wav\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2450982/2935496521.py:19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df_audio[\"rate\"] = rate\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 3.2782555e-07 -4.1872263e-05 -2.2262335e-05 ... 
-4.3265522e-05\n", + " -1.9449741e-05 -5.1818788e-05]\n", + "./Elpaco dataset/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo.wav\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2450982/2935496521.py:19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df_audio[\"rate\"] = rate\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 3.2782555e-07 -4.1872263e-05 -2.2262335e-05 ... -4.3265522e-05\n", + " -1.9449741e-05 -5.1818788e-05]\n", + "[ uid language begin end duration \n", + "0 akhoe_haikom-2-194-371425 akhoe_haikom 371425.0 371725.0 300.0 \\\n", + "1 akhoe_haikom-2-203-398335 akhoe_haikom 398335.0 398545.0 210.0 \n", + "2 akhoe_haikom-2-457-1245412 akhoe_haikom 1245412.0 1245780.0 368.0 \n", + "3 akhoe_haikom-2-459-1247720 akhoe_haikom 1247720.0 1248010.0 290.0 \n", + "4 akhoe_haikom-2-482-1290491 akhoe_haikom 1290491.0 1290851.0 360.0 \n", + "5 akhoe_haikom-2-484-1291852 akhoe_haikom 1291852.0 1292192.0 340.0 \n", + "6 akhoe_haikom-2-492-1300050 akhoe_haikom 1300050.0 1300810.0 760.0 \n", + "7 akhoe_haikom-2-494-1312920 akhoe_haikom 1312920.0 1313280.0 360.0 \n", + "\n", + " source participant utterance_stripped utterance \n", + "0 /akhoe_haikom1/state_hospital tx@Es î î \\\n", + "1 /akhoe_haikom1/state_hospital tx@Es î î \n", + "2 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "3 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "4 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "5 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "6 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "7 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "\n", + " form_ascii rate \n", + "0 i_ 22050 \n", + "1 i_ 22050 \n", + "2 i_ 22050 \n", + "3 i_ 
22050 \n", + "4 i_ 22050 \n", + "5 i_ 22050 \n", + "6 i_ 22050 \n", + "7 i_ 22050 , uid language begin end duration \n", + "0 akhoe_haikom-2-194-371425 akhoe_haikom 371425.0 371725.0 300.0 \\\n", + "1 akhoe_haikom-2-203-398335 akhoe_haikom 398335.0 398545.0 210.0 \n", + "2 akhoe_haikom-2-457-1245412 akhoe_haikom 1245412.0 1245780.0 368.0 \n", + "3 akhoe_haikom-2-459-1247720 akhoe_haikom 1247720.0 1248010.0 290.0 \n", + "4 akhoe_haikom-2-482-1290491 akhoe_haikom 1290491.0 1290851.0 360.0 \n", + "5 akhoe_haikom-2-484-1291852 akhoe_haikom 1291852.0 1292192.0 340.0 \n", + "6 akhoe_haikom-2-492-1300050 akhoe_haikom 1300050.0 1300810.0 760.0 \n", + "7 akhoe_haikom-2-494-1312920 akhoe_haikom 1312920.0 1313280.0 360.0 \n", + "\n", + " source participant utterance_stripped utterance \n", + "0 /akhoe_haikom1/state_hospital tx@Es î î \\\n", + "1 /akhoe_haikom1/state_hospital tx@Es î î \n", + "2 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "3 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "4 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "5 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "6 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "7 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "\n", + " form_ascii rate \n", + "0 i_ 22050 \n", + "1 i_ 22050 \n", + "2 i_ 22050 \n", + "3 i_ 22050 \n", + "4 i_ 22050 \n", + "5 i_ 22050 \n", + "6 i_ 22050 \n", + "7 i_ 22050 , uid language begin end duration \n", + "0 akhoe_haikom-2-194-371425 akhoe_haikom 371425.0 371725.0 300.0 \\\n", + "1 akhoe_haikom-2-203-398335 akhoe_haikom 398335.0 398545.0 210.0 \n", + "2 akhoe_haikom-2-457-1245412 akhoe_haikom 1245412.0 1245780.0 368.0 \n", + "3 akhoe_haikom-2-459-1247720 akhoe_haikom 1247720.0 1248010.0 290.0 \n", + "4 akhoe_haikom-2-482-1290491 akhoe_haikom 1290491.0 1290851.0 360.0 \n", + "5 akhoe_haikom-2-484-1291852 akhoe_haikom 1291852.0 1292192.0 340.0 \n", + "6 akhoe_haikom-2-492-1300050 akhoe_haikom 1300050.0 1300810.0 760.0 \n", + "7 akhoe_haikom-2-494-1312920 akhoe_haikom 1312920.0 
1313280.0 360.0 \n", + "\n", + " source participant utterance_stripped utterance \n", + "0 /akhoe_haikom1/state_hospital tx@Es î î \\\n", + "1 /akhoe_haikom1/state_hospital tx@Es î î \n", + "2 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "3 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "4 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "5 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "6 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "7 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "\n", + " form_ascii rate \n", + "0 i_ 22050 \n", + "1 i_ 22050 \n", + "2 i_ 22050 \n", + "3 i_ 22050 \n", + "4 i_ 22050 \n", + "5 i_ 22050 \n", + "6 i_ 22050 \n", + "7 i_ 22050 , uid language begin end duration \n", + "0 akhoe_haikom-2-194-371425 akhoe_haikom 371425.0 371725.0 300.0 \\\n", + "1 akhoe_haikom-2-203-398335 akhoe_haikom 398335.0 398545.0 210.0 \n", + "2 akhoe_haikom-2-457-1245412 akhoe_haikom 1245412.0 1245780.0 368.0 \n", + "3 akhoe_haikom-2-459-1247720 akhoe_haikom 1247720.0 1248010.0 290.0 \n", + "4 akhoe_haikom-2-482-1290491 akhoe_haikom 1290491.0 1290851.0 360.0 \n", + "5 akhoe_haikom-2-484-1291852 akhoe_haikom 1291852.0 1292192.0 340.0 \n", + "6 akhoe_haikom-2-492-1300050 akhoe_haikom 1300050.0 1300810.0 760.0 \n", + "7 akhoe_haikom-2-494-1312920 akhoe_haikom 1312920.0 1313280.0 360.0 \n", + "\n", + " source participant utterance_stripped utterance \n", + "0 /akhoe_haikom1/state_hospital tx@Es î î \\\n", + "1 /akhoe_haikom1/state_hospital tx@Es î î \n", + "2 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "3 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "4 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "5 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "6 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "7 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "\n", + " form_ascii rate \n", + "0 i_ 22050 \n", + "1 i_ 22050 \n", + "2 i_ 22050 \n", + "3 i_ 22050 \n", + "4 i_ 22050 \n", + "5 i_ 22050 \n", + "6 i_ 22050 \n", + "7 i_ 22050 , uid language begin end duration \n", + 
"0 akhoe_haikom-2-194-371425 akhoe_haikom 371425.0 371725.0 300.0 \\\n", + "1 akhoe_haikom-2-203-398335 akhoe_haikom 398335.0 398545.0 210.0 \n", + "2 akhoe_haikom-2-457-1245412 akhoe_haikom 1245412.0 1245780.0 368.0 \n", + "3 akhoe_haikom-2-459-1247720 akhoe_haikom 1247720.0 1248010.0 290.0 \n", + "4 akhoe_haikom-2-482-1290491 akhoe_haikom 1290491.0 1290851.0 360.0 \n", + "5 akhoe_haikom-2-484-1291852 akhoe_haikom 1291852.0 1292192.0 340.0 \n", + "6 akhoe_haikom-2-492-1300050 akhoe_haikom 1300050.0 1300810.0 760.0 \n", + "7 akhoe_haikom-2-494-1312920 akhoe_haikom 1312920.0 1313280.0 360.0 \n", + "\n", + " source participant utterance_stripped utterance \n", + "0 /akhoe_haikom1/state_hospital tx@Es î î \\\n", + "1 /akhoe_haikom1/state_hospital tx@Es î î \n", + "2 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "3 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "4 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "5 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "6 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "7 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "\n", + " form_ascii rate \n", + "0 i_ 22050 \n", + "1 i_ 22050 \n", + "2 i_ 22050 \n", + "3 i_ 22050 \n", + "4 i_ 22050 \n", + "5 i_ 22050 \n", + "6 i_ 22050 \n", + "7 i_ 22050 , uid language begin end duration \n", + "0 akhoe_haikom-2-194-371425 akhoe_haikom 371425.0 371725.0 300.0 \\\n", + "1 akhoe_haikom-2-203-398335 akhoe_haikom 398335.0 398545.0 210.0 \n", + "2 akhoe_haikom-2-457-1245412 akhoe_haikom 1245412.0 1245780.0 368.0 \n", + "3 akhoe_haikom-2-459-1247720 akhoe_haikom 1247720.0 1248010.0 290.0 \n", + "4 akhoe_haikom-2-482-1290491 akhoe_haikom 1290491.0 1290851.0 360.0 \n", + "5 akhoe_haikom-2-484-1291852 akhoe_haikom 1291852.0 1292192.0 340.0 \n", + "6 akhoe_haikom-2-492-1300050 akhoe_haikom 1300050.0 1300810.0 760.0 \n", + "7 akhoe_haikom-2-494-1312920 akhoe_haikom 1312920.0 1313280.0 360.0 \n", + "\n", + " source participant utterance_stripped utterance \n", + "0 /akhoe_haikom1/state_hospital tx@Es î 
î \\\n", + "1 /akhoe_haikom1/state_hospital tx@Es î î \n", + "2 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "3 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "4 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "5 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "6 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "7 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "\n", + " form_ascii rate \n", + "0 i_ 22050 \n", + "1 i_ 22050 \n", + "2 i_ 22050 \n", + "3 i_ 22050 \n", + "4 i_ 22050 \n", + "5 i_ 22050 \n", + "6 i_ 22050 \n", + "7 i_ 22050 , uid language begin end duration \n", + "0 akhoe_haikom-2-194-371425 akhoe_haikom 371425.0 371725.0 300.0 \\\n", + "1 akhoe_haikom-2-203-398335 akhoe_haikom 398335.0 398545.0 210.0 \n", + "2 akhoe_haikom-2-457-1245412 akhoe_haikom 1245412.0 1245780.0 368.0 \n", + "3 akhoe_haikom-2-459-1247720 akhoe_haikom 1247720.0 1248010.0 290.0 \n", + "4 akhoe_haikom-2-482-1290491 akhoe_haikom 1290491.0 1290851.0 360.0 \n", + "5 akhoe_haikom-2-484-1291852 akhoe_haikom 1291852.0 1292192.0 340.0 \n", + "6 akhoe_haikom-2-492-1300050 akhoe_haikom 1300050.0 1300810.0 760.0 \n", + "7 akhoe_haikom-2-494-1312920 akhoe_haikom 1312920.0 1313280.0 360.0 \n", + "\n", + " source participant utterance_stripped utterance \n", + "0 /akhoe_haikom1/state_hospital tx@Es î î \\\n", + "1 /akhoe_haikom1/state_hospital tx@Es î î \n", + "2 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "3 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "4 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "5 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "6 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "7 /akhoe_haikom1/state_hospital tx@Ga î î \n", + "\n", + " form_ascii rate \n", + "0 i_ 22050 \n", + "1 i_ 22050 \n", + "2 i_ 22050 \n", + "3 i_ 22050 \n", + "4 i_ 22050 \n", + "5 i_ 22050 \n", + "6 i_ 22050 \n", + "7 i_ 22050 , uid language begin end duration \n", + "8 akie-1-030-95822 akie 95822.0 96442.0 620.0 \\\n", + "9 akie-1-033-99525 akie 99525.0 100289.0 764.0 \n", + "10 
akie-1-041-112708 akie 112708.0 113647.0 939.0 \n", + "11 akie-1-048-125403 akie 125403.0 125935.0 532.0 \n", + "12 akie-1-074-183638 akie 183638.0 184542.0 904.0 \n", + "13 akie-1-076-186733 akie 186733.0 187438.0 705.0 \n", + "14 akie-1-078-189573 akie 189573.0 190705.0 1132.0 \n", + "15 akie-1-083-198073 akie 198073.0 198794.0 721.0 \n", + "16 akie-1-087-205578 akie 205578.0 206094.0 516.0 \n", + "17 akie-1-103-230912 akie 230912.0 231810.0 898.0 \n", + "18 akie-1-105-234173 akie 234173.0 234831.0 658.0 \n", + "19 akie-1-108-237562 akie 237562.0 238347.0 785.0 \n", + "20 akie-1-127-274810 akie 274810.0 275768.0 958.0 \n", + "21 akie-1-132-285746 akie 285746.0 286489.0 743.0 \n", + "22 akie-1-146-315052 akie 315052.0 316031.0 979.0 \n", + "23 akie-1-149-320678 akie 320678.0 321089.0 411.0 \n", + "24 akie-1-153-327799 akie 327799.0 328542.0 743.0 \n", + "25 akie-1-156-332779 akie 332779.0 333547.0 768.0 \n", + "26 akie-1-172-373624 akie 373624.0 374426.0 802.0 \n", + "27 akie-1-176-380275 akie 380275.0 381003.0 728.0 \n", + "28 akie-1-178-386315 akie 386315.0 386870.0 555.0 \n", + "29 akie-1-180-392001 akie 392001.0 392513.0 512.0 \n", + "30 akie-1-182-395794 akie 395794.0 396578.0 784.0 \n", + "31 akie-1-190-411123 akie 411123.0 411734.0 611.0 \n", + "32 akie-1-193-414528 akie 414528.0 415213.0 685.0 \n", + "33 akie-1-196-417624 akie 417624.0 418247.0 623.0 \n", + "34 akie-1-198-420819 akie 420819.0 421148.0 329.0 \n", + "35 akie-1-209-441166 akie 441166.0 441654.0 488.0 \n", + "36 akie-1-212-449198 akie 449198.0 449791.0 593.0 \n", + "\n", + " source participant \n", + "8 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \\\n", + "9 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "10 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "11 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "12 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "13 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "14 
/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "15 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "16 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "17 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "18 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "19 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "20 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "21 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "22 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "23 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "24 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "25 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "26 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "27 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "28 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "29 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "30 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "31 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "32 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "33 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "34 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "35 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "36 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "\n", + " utterance_stripped utterance form_ascii rate \n", + "8 aá aá aa_ 22050 \n", + "9 aá aá aa_ 22050 \n", + "10 aá aá aa_ 22050 \n", + "11 aá aá aa_ 22050 \n", + "12 aá aá aa_ 22050 \n", + "13 aá aá aa_ 22050 \n", + "14 aá aá aa_ 22050 \n", + "15 aá aá aa_ 22050 \n", + "16 aá aá aa_ 22050 \n", + "17 aá aá aa_ 22050 \n", + "18 aá aá aa_ 22050 \n", + "19 aá aá aa_ 22050 \n", + "20 aá aá aa_ 22050 \n", + "21 aá aá aa_ 22050 \n", + "22 aá aá aa_ 22050 \n", + "23 aá aá aa_ 22050 \n", + "24 aá aá aa_ 22050 \n", + "25 aá aá aa_ 22050 \n", + "26 aá 
aá aa_ 22050 \n", + "27 aá aá aa_ 22050 \n", + "28 aá aá aa_ 22050 \n", + "29 aá aá aa_ 22050 \n", + "30 aá aá aa_ 22050 \n", + "31 aá aá aa_ 22050 \n", + "32 aá aá aa_ 22050 \n", + "33 aá aá aa_ 22050 \n", + "34 aá aá aa_ 22050 \n", + "35 aá aá aa_ 22050 \n", + "36 aá aá aa_ 22050 , uid language begin end duration \n", + "8 akie-1-030-95822 akie 95822.0 96442.0 620.0 \\\n", + "9 akie-1-033-99525 akie 99525.0 100289.0 764.0 \n", + "10 akie-1-041-112708 akie 112708.0 113647.0 939.0 \n", + "11 akie-1-048-125403 akie 125403.0 125935.0 532.0 \n", + "12 akie-1-074-183638 akie 183638.0 184542.0 904.0 \n", + "13 akie-1-076-186733 akie 186733.0 187438.0 705.0 \n", + "14 akie-1-078-189573 akie 189573.0 190705.0 1132.0 \n", + "15 akie-1-083-198073 akie 198073.0 198794.0 721.0 \n", + "16 akie-1-087-205578 akie 205578.0 206094.0 516.0 \n", + "17 akie-1-103-230912 akie 230912.0 231810.0 898.0 \n", + "18 akie-1-105-234173 akie 234173.0 234831.0 658.0 \n", + "19 akie-1-108-237562 akie 237562.0 238347.0 785.0 \n", + "20 akie-1-127-274810 akie 274810.0 275768.0 958.0 \n", + "21 akie-1-132-285746 akie 285746.0 286489.0 743.0 \n", + "22 akie-1-146-315052 akie 315052.0 316031.0 979.0 \n", + "23 akie-1-149-320678 akie 320678.0 321089.0 411.0 \n", + "24 akie-1-153-327799 akie 327799.0 328542.0 743.0 \n", + "25 akie-1-156-332779 akie 332779.0 333547.0 768.0 \n", + "26 akie-1-172-373624 akie 373624.0 374426.0 802.0 \n", + "27 akie-1-176-380275 akie 380275.0 381003.0 728.0 \n", + "28 akie-1-178-386315 akie 386315.0 386870.0 555.0 \n", + "29 akie-1-180-392001 akie 392001.0 392513.0 512.0 \n", + "30 akie-1-182-395794 akie 395794.0 396578.0 784.0 \n", + "31 akie-1-190-411123 akie 411123.0 411734.0 611.0 \n", + "32 akie-1-193-414528 akie 414528.0 415213.0 685.0 \n", + "33 akie-1-196-417624 akie 417624.0 418247.0 623.0 \n", + "34 akie-1-198-420819 akie 420819.0 421148.0 329.0 \n", + "35 akie-1-209-441166 akie 441166.0 441654.0 488.0 \n", + "36 akie-1-212-449198 akie 449198.0 449791.0 593.0 \n", 
+ "\n", + " source participant \n", + "8 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \\\n", + "9 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "10 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "11 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "12 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "13 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "14 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "15 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "16 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "17 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "18 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "19 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "20 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "21 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "22 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "23 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "24 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "25 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "26 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "27 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "28 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "29 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "30 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "31 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "32 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "33 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "34 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "35 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "36 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", + "\n", + " utterance_stripped utterance form_ascii rate \n", + "8 aá aá aa_ 22050 \n", + "9 aá aá aa_ 22050 \n", + "10 aá aá aa_ 22050 
\n", + "11 aá aá aa_ 22050 \n", + "12 aá aá aa_ 22050 \n", + "13 aá aá aa_ 22050 \n", + "14 aá aá aa_ 22050 \n", + "15 aá aá aa_ 22050 \n", + "16 aá aá aa_ 22050 \n", + "17 aá aá aa_ 22050 \n", + "18 aá aá aa_ 22050 \n", + "19 aá aá aa_ 22050 \n", + "20 aá aá aa_ 22050 \n", + "21 aá aá aa_ 22050 \n", + "22 aá aá aa_ 22050 \n", + "23 aá aá aa_ 22050 \n", + "24 aá aá aa_ 22050 \n", + "25 aá aá aa_ 22050 \n", + "26 aá aá aa_ 22050 \n", + "27 aá aá aa_ 22050 \n", + "28 aá aá aa_ 22050 \n", + "29 aá aá aa_ 22050 \n", + "30 aá aá aa_ 22050 \n", + "31 aá aá aa_ 22050 \n", + "32 aá aá aa_ 22050 \n", + "33 aá aá aa_ 22050 \n", + "34 aá aá aa_ 22050 \n", + "35 aá aá aa_ 22050 \n", + "36 aá aá aa_ 22050 ]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_2450982/2935496521.py:19: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df_audio[\"rate\"] = rate\n" + ] + } + ], + "source": [ + "#execute as for loop\n", + "#[get_row_audio(df_audio[df_audio.source == source], './Elpaco dataset'+ source +'.wav') for source in df_audio.source]\n", + "#[get_sampling_rate(f'./Elpaco dataset{source}.wav') for source in df_audio.source[1:10]]\n", + "\n", + "df_audio_10 = [get_row_audio(df_audio[df_audio.source == source], f'./Elpaco dataset{source}.wav') for source in df_audio.source[1:10]]\n", + "#[get_sampling_rate('./Elpaco dataset'+ source +'.wav') for source in df_audio.source]\n", + "\n", + "print(df_audio_10)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "list indices must be integers or slices, not str", + "output_type": "error", + "traceback": [ + 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[24], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mprint\u001b[39m(df_audio_10[\u001b[39m\"\u001b[39;49m\u001b[39mrate\u001b[39;49m\u001b[39m\"\u001b[39;49m])\n", + "\u001b[0;31mTypeError\u001b[0m: list indices must be integers or slices, not str" + ] + } + ], + "source": [ + "print(df_audio_10[\"rate\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/139551 [00:00 4\u001b[0m df_audios \u001b[39m=\u001b[39m parallel(\n\u001b[1;32m 5\u001b[0m delayed(get_row_audio)(\n\u001b[1;32m 6\u001b[0m df_audio[df_audio\u001b[39m.\u001b[39;49msource \u001b[39m==\u001b[39;49m source], \n\u001b[1;32m 7\u001b[0m \u001b[39m'\u001b[39;49m\u001b[39m./Elpaco dataset\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39m+\u001b[39;49m source \u001b[39m+\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39m.wav\u001b[39;49m\u001b[39m'\u001b[39;49m, \u001b[39m#Edit path to corpus directory here\u001b[39;49;00m\n\u001b[1;32m 8\u001b[0m )\n\u001b[1;32m 9\u001b[0m \u001b[39mfor\u001b[39;49;00m source \u001b[39min\u001b[39;49;00m tqdm(df_audio\u001b[39m.\u001b[39;49msource)\n\u001b[1;32m 10\u001b[0m )\n\u001b[1;32m 11\u001b[0m df_audio \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39mconcat(df_audios)\n\u001b[1;32m 12\u001b[0m \u001b[39mlen\u001b[39m(df_audio)\n", + "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/joblib/parallel.py:1088\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m 1085\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdispatch_one_batch(iterator):\n\u001b[1;32m 1086\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_iterating \u001b[39m=\u001b[39m 
\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_original_iterator \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m-> 1088\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mdispatch_one_batch(iterator):\n\u001b[1;32m 1089\u001b[0m \u001b[39mpass\u001b[39;00m\n\u001b[1;32m 1091\u001b[0m \u001b[39mif\u001b[39;00m pre_dispatch \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mall\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mor\u001b[39;00m n_jobs \u001b[39m==\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[1;32m 1092\u001b[0m \u001b[39m# The iterable was consumed all at once by the above for loop.\u001b[39;00m\n\u001b[1;32m 1093\u001b[0m \u001b[39m# No need to wait for async callbacks to trigger to\u001b[39;00m\n\u001b[1;32m 1094\u001b[0m \u001b[39m# consumption.\u001b[39;00m\n", + "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/joblib/parallel.py:901\u001b[0m, in \u001b[0;36mParallel.dispatch_one_batch\u001b[0;34m(self, iterator)\u001b[0m\n\u001b[1;32m 899\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m 900\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 901\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_dispatch(tasks)\n\u001b[1;32m 902\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mTrue\u001b[39;00m\n", + "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/joblib/parallel.py:819\u001b[0m, in \u001b[0;36mParallel._dispatch\u001b[0;34m(self, batch)\u001b[0m\n\u001b[1;32m 817\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_lock:\n\u001b[1;32m 818\u001b[0m job_idx \u001b[39m=\u001b[39m \u001b[39mlen\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_jobs)\n\u001b[0;32m--> 819\u001b[0m job \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_backend\u001b[39m.\u001b[39;49mapply_async(batch, callback\u001b[39m=\u001b[39;49mcb)\n\u001b[1;32m 820\u001b[0m \u001b[39m# A 
job can complete so quickly than its callback is\u001b[39;00m\n\u001b[1;32m 821\u001b[0m \u001b[39m# called before we get here, causing self._jobs to\u001b[39;00m\n\u001b[1;32m 822\u001b[0m \u001b[39m# grow. To ensure correct results ordering, .insert is\u001b[39;00m\n\u001b[1;32m 823\u001b[0m \u001b[39m# used (rather than .append) in the following line\u001b[39;00m\n\u001b[1;32m 824\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_jobs\u001b[39m.\u001b[39minsert(job_idx, job)\n", + "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/joblib/_parallel_backends.py:208\u001b[0m, in \u001b[0;36mSequentialBackend.apply_async\u001b[0;34m(self, func, callback)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mapply_async\u001b[39m(\u001b[39mself\u001b[39m, func, callback\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m):\n\u001b[1;32m 207\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Schedule a func to be run\"\"\"\u001b[39;00m\n\u001b[0;32m--> 208\u001b[0m result \u001b[39m=\u001b[39m ImmediateResult(func)\n\u001b[1;32m 209\u001b[0m \u001b[39mif\u001b[39;00m callback:\n\u001b[1;32m 210\u001b[0m callback(result)\n", + "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/joblib/_parallel_backends.py:597\u001b[0m, in \u001b[0;36mImmediateResult.__init__\u001b[0;34m(self, batch)\u001b[0m\n\u001b[1;32m 594\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__init__\u001b[39m(\u001b[39mself\u001b[39m, batch):\n\u001b[1;32m 595\u001b[0m \u001b[39m# Don't delay the application, to avoid keeping the input\u001b[39;00m\n\u001b[1;32m 596\u001b[0m \u001b[39m# arguments in memory\u001b[39;00m\n\u001b[0;32m--> 597\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mresults \u001b[39m=\u001b[39m batch()\n", + "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/joblib/parallel.py:288\u001b[0m, in \u001b[0;36mBatchedCalls.__call__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 284\u001b[0m \u001b[39mdef\u001b[39;00m 
\u001b[39m__call__\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[1;32m 285\u001b[0m \u001b[39m# Set the default nested backend to self._backend but do not set the\u001b[39;00m\n\u001b[1;32m 286\u001b[0m \u001b[39m# change the default number of processes to -1\u001b[39;00m\n\u001b[1;32m 287\u001b[0m \u001b[39mwith\u001b[39;00m parallel_backend(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backend, n_jobs\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_n_jobs):\n\u001b[0;32m--> 288\u001b[0m \u001b[39mreturn\u001b[39;00m [func(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m 289\u001b[0m \u001b[39mfor\u001b[39;00m func, args, kwargs \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mitems]\n", + "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/joblib/parallel.py:288\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 284\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__call__\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[1;32m 285\u001b[0m \u001b[39m# Set the default nested backend to self._backend but do not set the\u001b[39;00m\n\u001b[1;32m 286\u001b[0m \u001b[39m# change the default number of processes to -1\u001b[39;00m\n\u001b[1;32m 287\u001b[0m \u001b[39mwith\u001b[39;00m parallel_backend(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backend, n_jobs\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_n_jobs):\n\u001b[0;32m--> 288\u001b[0m \u001b[39mreturn\u001b[39;00m [func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 289\u001b[0m \u001b[39mfor\u001b[39;00m func, args, kwargs \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mitems]\n", + "Cell \u001b[0;32mIn[4], line 9\u001b[0m, in \u001b[0;36mget_row_audio\u001b[0;34m(df_audio, wav_loc)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[39m#print path to audio file (optional)\u001b[39;00m\n\u001b[1;32m 7\u001b[0m 
\u001b[39mprint\u001b[39m(wav_loc)\n\u001b[0;32m----> 9\u001b[0m data, rate \u001b[39m=\u001b[39m librosa\u001b[39m.\u001b[39;49mload(wav_loc)\n\u001b[1;32m 10\u001b[0m data \u001b[39m=\u001b[39m data\u001b[39m.\u001b[39mastype(\u001b[39m'\u001b[39m\u001b[39mfloat32\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m 11\u001b[0m \u001b[39mprint\u001b[39m(data)\n", + "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/librosa/core/audio.py:190\u001b[0m, in \u001b[0;36mload\u001b[0;34m(path, sr, mono, offset, duration, dtype, res_type)\u001b[0m\n\u001b[1;32m 188\u001b[0m \u001b[39m# Final cleanup for dtype and contiguity\u001b[39;00m\n\u001b[1;32m 189\u001b[0m \u001b[39mif\u001b[39;00m mono:\n\u001b[0;32m--> 190\u001b[0m y \u001b[39m=\u001b[39m to_mono(y)\n\u001b[1;32m 192\u001b[0m \u001b[39mif\u001b[39;00m sr \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 193\u001b[0m y \u001b[39m=\u001b[39m resample(y, orig_sr\u001b[39m=\u001b[39msr_native, target_sr\u001b[39m=\u001b[39msr, res_type\u001b[39m=\u001b[39mres_type)\n", + "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/librosa/core/audio.py:513\u001b[0m, in \u001b[0;36mto_mono\u001b[0;34m(y)\u001b[0m\n\u001b[1;32m 510\u001b[0m util\u001b[39m.\u001b[39mvalid_audio(y, mono\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 512\u001b[0m \u001b[39mif\u001b[39;00m y\u001b[39m.\u001b[39mndim \u001b[39m>\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[0;32m--> 513\u001b[0m y \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39;49mmean(y, axis\u001b[39m=\u001b[39;49m\u001b[39mtuple\u001b[39;49m(\u001b[39mrange\u001b[39;49m(y\u001b[39m.\u001b[39;49mndim \u001b[39m-\u001b[39;49m \u001b[39m1\u001b[39;49m)))\n\u001b[1;32m 515\u001b[0m \u001b[39mreturn\u001b[39;00m y\n", + "File \u001b[0;32m<__array_function__ internals>:200\u001b[0m, in \u001b[0;36mmean\u001b[0;34m(*args, **kwargs)\u001b[0m\n", + "File 
\u001b[0;32m~/elpaco/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3464\u001b[0m, in \u001b[0;36mmean\u001b[0;34m(a, axis, dtype, out, keepdims, where)\u001b[0m\n\u001b[1;32m 3461\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 3462\u001b[0m \u001b[39mreturn\u001b[39;00m mean(axis\u001b[39m=\u001b[39maxis, dtype\u001b[39m=\u001b[39mdtype, out\u001b[39m=\u001b[39mout, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m-> 3464\u001b[0m \u001b[39mreturn\u001b[39;00m _methods\u001b[39m.\u001b[39;49m_mean(a, axis\u001b[39m=\u001b[39;49maxis, dtype\u001b[39m=\u001b[39;49mdtype,\n\u001b[1;32m 3465\u001b[0m out\u001b[39m=\u001b[39;49mout, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n", + "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/numpy/core/_methods.py:181\u001b[0m, in \u001b[0;36m_mean\u001b[0;34m(a, axis, dtype, out, keepdims, where)\u001b[0m\n\u001b[1;32m 178\u001b[0m dtype \u001b[39m=\u001b[39m mu\u001b[39m.\u001b[39mdtype(\u001b[39m'\u001b[39m\u001b[39mf4\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m 179\u001b[0m is_float16_result \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[0;32m--> 181\u001b[0m ret \u001b[39m=\u001b[39m umr_sum(arr, axis, dtype, out, keepdims, where\u001b[39m=\u001b[39;49mwhere)\n\u001b[1;32m 182\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(ret, mu\u001b[39m.\u001b[39mndarray):\n\u001b[1;32m 183\u001b[0m \u001b[39mwith\u001b[39;00m _no_nep50_warning():\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "#execute using parallel \n", + "\n", + "with Parallel(n_jobs=1, verbose=verbosity) as parallel:\n", + " df_audios = parallel(\n", + " delayed(get_row_audio)(\n", + " df_audio[df_audio.source == source], \n", + " './Elpaco dataset'+ source +'.wav', #Edit path to corpus directory here\n", + " )\n", + " for source in tqdm(df_audio.source)\n", + " )\n", + "df_audio = pd.concat(df_audios)\n", + "len(df_audio)" + ] + }, + { + "attachments": {}, + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "### Normalize audio (optional)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# normalize audio if needed\n", + "df_audio['audio'] = [librosa.util.normalize(i) for i in df_audio.audio.values]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plot `audio` column waveforms" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 5%|▍ | 11/240 [00:00<00:00, 254.29it/s]\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# plot some example audio \n", + "nrows = 20\n", + "ncols = 12\n", + "zoom = 2\n", + "fig, axs = plt.subplots(ncols=ncols, nrows = nrows,figsize = (ncols*zoom, nrows+zoom/1.5))\n", + "for i, turn in tqdm(enumerate(df_audio['audio'].values), total = nrows*ncols):\n", + " ax = axs.flatten()[i]\n", + " ax.plot(turn)\n", + " if i == nrows*ncols -1:\n", + " break" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Save as `csv`" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'df_audio' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/Users/u517177/3.1_Make_audio_and_spec_cols.ipynb Cell 16\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0m df_audio\u001b[39m.\u001b[39mto_csv(\u001b[39m\"\u001b[39m\u001b[39mdf_audio.csv\u001b[39m\u001b[39m\"\u001b[39m, index\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 2\u001b[0m df_audio\n", + "\u001b[0;31mNameError\u001b[0m: name 'df_audio' is not defined" + ] + } + ], + "source": [ + "df_audio.to_csv(\"df_audio.csv\", index=False)\n", + "df_audio" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "vscode": { + "interpreter": { + "hash": 
"46fbe6e04536e1e35bdd7bb388ca2958a28e6ed66e8c3e8ff3ca43c0ea6b4829" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From c5e9ca1c0067ef08e3c426e5ad0e95d7b2a98135 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 9 Jun 2023 17:19:27 +0200 Subject: [PATCH 11/12] combine sample rate and audio extraction --- sktalk/read_audio.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/sktalk/read_audio.py b/sktalk/read_audio.py index 57ad989..b549288 100644 --- a/sktalk/read_audio.py +++ b/sktalk/read_audio.py @@ -2,7 +2,8 @@ import json import numpy as np -def get_sampling_rate(file_path): + +def load_audio(file_path): cmd = [ "ffprobe", "-v", "quiet", @@ -14,23 +15,18 @@ def get_sampling_rate(file_path): result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) output = json.loads(result.stdout) + sample_rate = None for stream in output["streams"]: if stream["codec_type"] == "audio": - return int(stream["sample_rate"]) - - raise ValueError("No audio stream found in the file") - -# Replace 'list_audio_3_40_balanced.wav' with the path to your audio file -# file_path = './Elpaco dataset/akhoe_haikom1/state_hospital.wav' - -# sampling_rate = get_sampling_rate(file_path) -# print(sampling_rate) + sample_rate = stream["sample_rate"] + no_channels = stream["channels"] #TODO the channels need to be preserved + if sample_rate is None: + raise ValueError("No audio stream found in the file") -def get_audio_ffmpeg(file_path): cmd = ["ffmpeg", "-i", file_path, '-f', 's16le', '-acodec', 'pcm_s16le', - '-ar', '22050', + '-ar', sample_rate, '-ac', '1', '-'] @@ -39,4 +35,4 @@ def get_audio_ffmpeg(file_path): audio_array = np.frombuffer(raw_audio, dtype="int16") audio_array = audio_array.astype(np.float32) / np.iinfo(np.int16).max - return audio_array + return audio_array, int(sample_rate) \ No newline at end of file From ad47bfb446ec2f92280c9f808a4760d4113b4d23 Mon Sep 17 00:00:00 2001 From: Barbara Vreede Date: Fri, 
9 Jun 2023 17:20:13 +0200 Subject: [PATCH 12/12] update audio nb --- notebooks/Make_audio_column_Barbara.ipynb | 2791 +-------------------- 1 file changed, 75 insertions(+), 2716 deletions(-) diff --git a/notebooks/Make_audio_column_Barbara.ipynb b/notebooks/Make_audio_column_Barbara.ipynb index 4df6dd7..eb033c7 100644 --- a/notebooks/Make_audio_column_Barbara.ipynb +++ b/notebooks/Make_audio_column_Barbara.ipynb @@ -1,22 +1,8 @@ { "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "RUcScYxiyCLa" - }, - "source": [ - "### To do list\n", - "todo:\n", - "- make debug function that logs errors\n", - "- make mel matrix \n", - "- make spec col" - ] - }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -26,2780 +12,153 @@ "sys.path.insert(0, \"../\")\n" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "kF0UisGqTpR5" - }, - "source": [ - "### Set working directory" - ] - }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "223EdHm_Tnod", - "outputId": "96e4f251-729d-4d90-c914-45e5c7cdfd82" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Current working directory: /home/bvreede\n" - ] - } - ], - "source": [ - "# Import the os module\n", - "import os\n", - "\n", - "# Print the current working directory\n", - "print(\"Current working directory: {0}\".format(os.getcwd()))\n", - "\n", - "# Change the current working directory\n", - "datadir = ('/vol/tensusers2/aliesenfeld/Elpaco dataset/')\n", - "#packagedir = ('vol/)\n", - "#os.chdir('/Users/u517177/continuer_paper/')\n", - "\n", - "\n", - "# Print the current working directory\n", - "#print(\"Current working directory: {0}\".format(os.getcwd()))" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", + "execution_count": 67, "metadata": {}, + "outputs": [], "source": [ 
- "# Part 1: audio extraction" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "u-YN_F_oaAgz" - }, - "source": [ - "### Dependencies\n" + "import librosa\n", + "from sktalk import read_audio\n", + "import IPython.display as ipd" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 86, "metadata": {}, "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "dXTbO87ZZ_ao" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_509310/67386651.py:13: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from tqdm.autonotebook import tqdm\n" - ] - } - ], "source": [ - "#on M1 mac librosa needs to be installed via miniforge\n", - "#!conda install -c conda-forge librosa\n", "import subprocess\n", "import json\n", - "import sounddevice as sd\n", + "import numpy as np\n", "\n", - "import wave\n", - "import IPython\n", - "import librosa\n", - "import pandas as pd\n", - "from scipy.io import wavfile\n", - "import matplotlib.pyplot as plt\n", - "from tqdm.autonotebook import tqdm\n", - "from joblib import Parallel, delayed\n", - "n_jobs = -1; verbosity = 10" + "\n", + "def load_audio(file_path):\n", + " cmd = [\n", + " \"ffprobe\",\n", + " \"-v\", \"quiet\",\n", + " \"-print_format\", \"json\",\n", + " \"-show_streams\",\n", + " file_path\n", + " ]\n", + "\n", + " result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)\n", + " output = json.loads(result.stdout)\n", + "\n", + " sample_rate = None\n", + " for stream in output[\"streams\"]:\n", + " if stream[\"codec_type\"] == \"audio\":\n", + " sample_rate = stream[\"sample_rate\"]\n", + " no_channels = str(stream[\"channels\"])\n", + "\n", + " if sample_rate is None:\n", + " raise ValueError(\"No audio stream found in the file\")\n", + "\n", + " cmd = 
[\"ffmpeg\", \"-i\", file_path, '-f', 's16le',\n", + " '-acodec', 'pcm_s16le',\n", + " '-ar', sample_rate,\n", + " '-ac', no_channels,\n", + " '-']\n", + "\n", + " print((' ').join(cmd))\n", + " pipe = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n", + " raw_audio = pipe.stdout\n", + " audio_array = np.frombuffer(raw_audio, dtype=\"int16\")\n", + " audio_array = audio_array.astype(np.float32) / np.iinfo(np.int16).max\n", + "\n", + " return audio_array, int(sample_rate)\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 87, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[0m\u001b[01;34m'Elpaco dataset'\u001b[0m/ \u001b[01;34moutput\u001b[0m/ streaks_10perlang.csv\n", - "\u001b[01;34m'continuer data'\u001b[0m/ \u001b[01;34moutput_trail\u001b[0m/ streaks_1perlang.csv\n", - " continuers_for_audio.csv \u001b[01;34mstreaks\u001b[0m/ streaks_50perlang.csv\n", - " discontinuers_for_audio.csv streaks.csv zic1z6cK\n" + "ffmpeg -i ../data/catalan_demo.wav -f s16le -acodec pcm_s16le -ar 16000 -ac 2 -\n", + "22050 16000\n" ] } ], "source": [ - "#install tensorflow if needed\n", + "wavaddress = \"../data/catalan_demo.wav\"\n", "\n", - "#!conda install -c conda-forge tensorflow -y\n", - "#!pip install tensorflow-macos" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Inspect single audio files" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: './Elpaco dataset/akhoe_haikom1/state_hospital.wav'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[4], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39m#Load and 
inspect single files\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m audiofile \u001b[39m=\u001b[39m wave\u001b[39m.\u001b[39;49mopen(\u001b[39m'\u001b[39;49m\u001b[39m./Elpaco dataset/akhoe_haikom1/state_hospital.wav\u001b[39;49m\u001b[39m'\u001b[39;49m,\u001b[39m'\u001b[39;49m\u001b[39mr\u001b[39;49m\u001b[39m'\u001b[39;49m)\n\u001b[1;32m 4\u001b[0m \u001b[39mprint\u001b[39m( \u001b[39m\"\u001b[39m\u001b[39mNumber of channels\u001b[39m\u001b[39m\"\u001b[39m,audiofile\u001b[39m.\u001b[39mgetnchannels())\n\u001b[1;32m 5\u001b[0m \u001b[39mprint\u001b[39m ( \u001b[39m\"\u001b[39m\u001b[39mSample width\u001b[39m\u001b[39m\"\u001b[39m,audiofile\u001b[39m.\u001b[39mgetsampwidth())\n", - "File \u001b[0;32m/usr/lib/python3.8/wave.py:510\u001b[0m, in \u001b[0;36mopen\u001b[0;34m(f, mode)\u001b[0m\n\u001b[1;32m 508\u001b[0m mode \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39mrb\u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m 509\u001b[0m \u001b[39mif\u001b[39;00m mode \u001b[39min\u001b[39;00m (\u001b[39m'\u001b[39m\u001b[39mr\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mrb\u001b[39m\u001b[39m'\u001b[39m):\n\u001b[0;32m--> 510\u001b[0m \u001b[39mreturn\u001b[39;00m Wave_read(f)\n\u001b[1;32m 511\u001b[0m \u001b[39melif\u001b[39;00m mode \u001b[39min\u001b[39;00m (\u001b[39m'\u001b[39m\u001b[39mw\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mwb\u001b[39m\u001b[39m'\u001b[39m):\n\u001b[1;32m 512\u001b[0m \u001b[39mreturn\u001b[39;00m Wave_write(f)\n", - "File \u001b[0;32m/usr/lib/python3.8/wave.py:160\u001b[0m, in \u001b[0;36mWave_read.__init__\u001b[0;34m(self, f)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_i_opened_the_file \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 159\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(f, \u001b[39mstr\u001b[39m):\n\u001b[0;32m--> 160\u001b[0m f \u001b[39m=\u001b[39m builtins\u001b[39m.\u001b[39;49mopen(f, 
\u001b[39m'\u001b[39;49m\u001b[39mrb\u001b[39;49m\u001b[39m'\u001b[39;49m)\n\u001b[1;32m 161\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_i_opened_the_file \u001b[39m=\u001b[39m f\n\u001b[1;32m 162\u001b[0m \u001b[39m# else, assume it is an open file object already\u001b[39;00m\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './Elpaco dataset/akhoe_haikom1/state_hospital.wav'" - ] - } - ], - "source": [ - "#Load and inspect single files\n", - "audiofile = wave.open('./Elpaco dataset/akhoe_haikom1/state_hospital.wav','r')\n", + "d_ffmpeg, r_ffmpeg = load_audio(wavaddress)\n", + "d_librosa, r_librosa = librosa.load(wavaddress)\n", "\n", - "print( \"Number of channels\",audiofile.getnchannels())\n", - "print ( \"Sample width\",audiofile.getsampwidth())\n", - "print ( \"Frame rate.\",audiofile.getframerate())\n", - "print (\"Number of frames\",audiofile.getnframes())\n", - "print ( \"parameters:\",audiofile.getparams())" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "#play audio files\n", - "#IPython.display.Audio('./Elpaco dataset/akhoe_haikom1/state_hospital.wav')\n", - "\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### read csv" + "print(r_librosa, r_ffmpeg)" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
uidlanguagebeginenddurationsourceparticipantutterance_strippedutteranceform_ascii
0akhoe_haikom-2-484-1291852akhoe_haikom1291.8521292.192340.0/akhoe_haikom1/state_hospitaltx@Gaîîi_
1akie-1-212-449198akie449.198449.791593.0/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyoBaa_
2ambel-3-509-809080ambel809.080810.020940.0/ambel1/AM064ESD_Transcription-txt-wgooooooo
3ambel-4-125-205440ambel205.440206.100660.0/ambel1/AM067WG_Transcription-txt-wgommmmmm
4anal-01-283-507370anal507.370507.890520.0/anal1/anm_20160916_PO_Wolring_1Anal sp2mmmmmm
5anal-05-029-60356anal60.35661.126770.0/anal1/anm_20160924_Thotson_grandmothers_1Anal sp2inginging
6anal-08-437-607867anal607.867608.365498.0/anal1/anm_20161014_PO_Darchol_evening_convers...anal speaker 4ummummumm
7anal-09-264-932963anal932.963934.3831420.0/anal1/anm_20161014_PO_Ralruwng_family_lunch1Anal sp1ummummumm
8anal-12-577-1069261anal1069.2611070.2981037.0/anal1/anm_20161210_oklu_chatting3Anal sp 5ummummumm
9anal-13-047-69501anal69.50170.127626.0/anal1/anm_20190803_grandmasanal speaker 1amatoamatoamato
\n", - "
" + " \n", + " " ], "text/plain": [ - " uid language begin end duration \n", - "0 akhoe_haikom-2-484-1291852 akhoe_haikom 1291.852 1292.192 340.0 \\\n", - "1 akie-1-212-449198 akie 449.198 449.791 593.0 \n", - "2 ambel-3-509-809080 ambel 809.080 810.020 940.0 \n", - "3 ambel-4-125-205440 ambel 205.440 206.100 660.0 \n", - "4 anal-01-283-507370 anal 507.370 507.890 520.0 \n", - "5 anal-05-029-60356 anal 60.356 61.126 770.0 \n", - "6 anal-08-437-607867 anal 607.867 608.365 498.0 \n", - "7 anal-09-264-932963 anal 932.963 934.383 1420.0 \n", - "8 anal-12-577-1069261 anal 1069.261 1070.298 1037.0 \n", - "9 anal-13-047-69501 anal 69.501 70.127 626.0 \n", - "\n", - " source \n", - "0 /akhoe_haikom1/state_hospital \\\n", - "1 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo \n", - "2 /ambel1/AM064 \n", - "3 /ambel1/AM067 \n", - "4 /anal1/anm_20160916_PO_Wolring_1 \n", - "5 /anal1/anm_20160924_Thotson_grandmothers_1 \n", - "6 /anal1/anm_20161014_PO_Darchol_evening_convers... \n", - "7 /anal1/anm_20161014_PO_Ralruwng_family_lunch1 \n", - "8 /anal1/anm_20161210_oklu_chatting3 \n", - "9 /anal1/anm_20190803_grandmas \n", - "\n", - " participant utterance_stripped utterance form_ascii \n", - "0 tx@Ga î î i_ \n", - "1 B aá aá aa_ \n", - "2 ESD_Transcription-txt-wgo oo oo oo \n", - "3 WG_Transcription-txt-wgo mm mm mm \n", - "4 Anal sp2 mm mm mm \n", - "5 Anal sp2 ing ing ing \n", - "6 anal speaker 4 umm umm umm \n", - "7 Anal sp1 umm umm umm \n", - "8 Anal sp 5 umm umm umm \n", - "9 anal speaker 1 amato amato amato " - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#read csv \n", - "datasource = datadir + 'continuers_for_audio.csv'\n", - "\n", - "df_audio = pd.read_csv(datasource)\n", - "#Optional:\n", - "#check timestamp format and .div is needed to match audio extaction function requirements\n", - "df_audio['begin'] = df_audio['begin'].div(1000)\n", - "df_audio['end'] = df_audio['end'].div(1000)\n", - "\n", - "#drop 
czech because it has separate speaker channels\n", - "df_audio = df_audio[df_audio.language != \"czech\"]\n", - "\n", - "#exclude corpora without audio files\n", - "df_audio = df_audio[df_audio.language != \"italian\"]\n", - "df_audio = df_audio[df_audio.language != \"zacatepec_chatino\"]\n", - "\n", - "#drop files without audio\n", - "\n", - "df_audio.dropna(subset=['begin','end'], inplace=True)\n", - "df_audio = df_audio.groupby(['source']).apply(lambda x: x.sample(1, replace=True)).reset_index(drop=True)\n", - "\n", - "df_audio_sub = df_audio[:10]\n", - "df_audio_sub" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### `read_audio` function" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'sktalk'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msktalk\u001b[39;00m \u001b[39mimport\u001b[39;00m read_audio\n\u001b[1;32m 3\u001b[0m \u001b[39m# def get_sampling_rate(file_path):\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[39m# cmd = [\u001b[39;00m\n\u001b[1;32m 5\u001b[0m \u001b[39m# \"ffprobe\",\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 39\u001b[0m \n\u001b[1;32m 40\u001b[0m \u001b[39m# return audio_array\u001b[39;00m\n\u001b[1;32m 43\u001b[0m wavaddress \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m/vol/tensusers2/aliesenfeld/Elpaco dataset/Elpaco dataset/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo.wav\u001b[39m\u001b[39m\"\u001b[39m\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'sktalk'" - ] - } - ], - "source": [ - "from sktalk import read_audio\n", - "\n", - "# def 
get_sampling_rate(file_path):\n", - "# cmd = [\n", - "# \"ffprobe\",\n", - "# \"-v\", \"quiet\",\n", - "# \"-print_format\", \"json\",\n", - "# \"-show_streams\",\n", - "# file_path\n", - "# ]\n", - "\n", - "# result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)\n", - "# output = json.loads(result.stdout)\n", - "\n", - "# for stream in output[\"streams\"]:\n", - "# if stream[\"codec_type\"] == \"audio\":\n", - "# return int(stream[\"sample_rate\"])\n", - "\n", - "# raise ValueError(\"No audio stream found in the file\")\n", - "\n", - "# # Replace 'list_audio_3_40_balanced.wav' with the path to your audio file\n", - "# # file_path = './Elpaco dataset/akhoe_haikom1/state_hospital.wav'\n", - "\n", - "# # sampling_rate = get_sampling_rate(file_path)\n", - "# # print(sampling_rate)\n", - "\n", - "\n", - "# def get_audio_ffmpeg(file_path):\n", - "# cmd = [\"ffmpeg\", \"-i\", file_path, '-f', 's16le',\n", - "# '-acodec', 'pcm_s16le',\n", - "# '-ar', '22050',\n", - "# '-ac', '1',\n", - "# '-']\n", - "\n", - "# pipe = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n", - "# raw_audio = pipe.stdout\n", - "# audio_array = np.frombuffer(raw_audio, dtype=\"int16\")\n", - "# audio_array = audio_array.astype(np.float32) / np.iinfo(np.int16).max\n", - "\n", - "# return audio_array\n", - "\n", - "\n", - "wavaddress = \"/vol/tensusers2/aliesenfeld/Elpaco dataset/Elpaco dataset/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo.wav\"\n", - "d_ffmpeg = read_audio.get_audio_ffmpeg(wavaddress)\n", - "r_ffmpeg = read_audio.get_sampling_rate(wavaddress)\n", - "\n", - "#ffmpeg -ss start_second -to end_second -i input.mp3 output.mp3\n", - "\n", - "d_librosa, r_librosa = librosa.load(wavaddress)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 0.0000000e+00 -3.0518509e-05 -3.0518509e-05 -3.0518509e-05\n", - " -6.1037019e-05 
-9.1555528e-05 -1.2207404e-04 -1.5259255e-04\n", - " -2.1362957e-04 -2.4414808e-04]\n", - "[ 3.2782555e-07 -4.1872263e-05 -2.2262335e-05 -4.6558678e-05\n", - " -5.6192279e-05 -8.2075596e-05 -1.2717396e-04 -1.5640259e-04\n", - " -2.0396709e-04 -2.4517626e-04]\n", - "24007680\n", - "24007680\n", - "1.0\n", - "1.2539232\n", - "-1.0000305\n", - "-1.0577291\n", - "44100\n", - "22050\n" - ] - } - ], - "source": [ - "print(d_ffmpeg[:10])\n", - "print(d_librosa[:10])\n", - "\n", - "print(len(d_ffmpeg))\n", - "print(len(d_librosa))\n", - "\n", - "print(max(d_ffmpeg))\n", - "print(max(d_librosa))\n", - "\n", - "print(min(d_ffmpeg))\n", - "print(min(d_librosa))\n", - "\n", - "\n", - "print(r_ffmpeg)\n", - "print(r_librosa)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "import pickle\n", - "\n", - "with open(\"../barbara_test/d_librosa.pickle\", \"wb\") as f:\n", - " pickle.dump(d_librosa, f)\n", - "\n", - "with open(\"../barbara_test/r_librosa.pickle\", \"wb\") as f:\n", - " pickle.dump(r_librosa, f)\n", - "\n", - "with open(\"../barbara_test/d_ffmpeg.pickle\", \"wb\") as f:\n", - " pickle.dump(d_ffmpeg, f)\n", - "\n", - "with open(\"../barbara_test/r_ffmpeg.pickle\", \"wb\") as f:\n", - " pickle.dump(r_ffmpeg, f)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" + "" ] }, - "execution_count": 7, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" } ], "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "plt.plot(d_ffmpeg)" + "ipd.Audio(wavaddress) # load a local WAV file" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 88, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "\n", + " \n", + " " + ], "text/plain": [ - "[]" + "" ] }, - "execution_count": 8, + "execution_count": 88, "metadata": {}, "output_type": "execute_result" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" } ], "source": [ - "plt.plot(d_librosa)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "import soundfile as sf\n", - "import numpy as np\n", - "import simpleaudio as sa\n", - "\n", - "# Generate a sine wave\n", - "freq = 440\n", - "duration = 3\n", - "sample_rate = 44100\n", - "t = np.linspace(0, duration, int(duration * sample_rate), False)\n", - "waveform = np.sin(freq * t * 2 * np.pi) * 32767\n", - "\n", - "# Play the audio\n", - "audio_data = waveform.astype(np.int16)\n", - "play_obj = sa.play_buffer(audio_data, 1, 2, sample_rate)\n", - "\n", - "# Wait for the audio to finish playing\n", - "play_obj.wait_done()" + "ipd.Audio(d_ffmpeg, rate=r_ffmpeg)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "import soundfile as sf\n", - "import numpy as np\n", - "import simpleaudio as sa\n", - "\n", - "# Read the WAV file\n", - "filename = \"./Elpaco dataset/akhoe_haikom1/state_hospital.wav\"\n", - "audio_data, sample_rate = sf.read(filename)\n", - "\n", - "# Convert the audio data to the correct format\n", - "audio_data = (audio_data * 32767).astype(np.int16)\n", - "\n", - "# Play the audio\n", - "play_obj = sa.play_buffer(audio_data, 2, 2, sample_rate)\n", - "\n", - "# Wait for the audio to finish playing\n", - "play_obj.wait_done()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo 112.708 113.647\n" - ] - } - ], - "source": [ - "n = 1\n", - "\n", - "audiorow_start = df_audio[\"begin\"][n]\n", - "audiorow_end = df_audio[\"end\"][n]\n", - "audiorow_file = df_audio[\"source\"][n]\n", - "\n", - "print(audiorow_file, audiorow_start, audiorow_end)2\n", - "\n", - "#get_audio_ffmpeg(audiorow_file, audiorow_start, 
audiorow_end)\n", - "\n", - "ffmpeg -ss 112.708 -to 113.647 -i\n", - "/vol/tensusers2/aliesenfeld/Elpaco\\ dataset/Elpaco\\ dataset/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo.wav" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo\n", - "Name: source, dtype: object\n", - "2 /ambel1/AM064\n", - "Name: source, dtype: object\n", - "3 /ambel1/AM067\n", - "Name: source, dtype: object\n", - "4 /anal1/anm_20160916_PO_Wolring_1\n", - "Name: source, dtype: object\n", - "5 /anal1/anm_20160924_Thotson_grandmothers_1\n", - "Name: source, dtype: object\n", - "6 /anal1/anm_20161014_PO_Darchol_evening_convers...\n", - "Name: source, dtype: object\n", - "7 /anal1/anm_20161014_PO_Ralruwng_family_lunch1\n", - "Name: source, dtype: object\n", - "8 /anal1/anm_20161210_oklu_chatting3\n", - "Name: source, dtype: object\n", - "9 /anal1/anm_20190803_grandmas\n", - "Name: source, dtype: object\n" - ] - }, - { - "data": { - "text/plain": [ - "[None, None, None, None, None, None, None, None, None]" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\n", - "def get_row_audio(df):\n", - " \"\"\" load audio and grab individual turns\n", - " TODO: for large sparse WAV files, the audio should be loaded only for the turn\n", - " \"\"\"\n", - " print(df[\"source\"])\n", - " #source = df[\"source\"][0]\n", - " #wav_loc = f'./Elpaco dataset{source}.wav'\n", - " #print path to audio file (optional)\n", - " #print(wav_loc)\n", "\n", - " # # load audio\n", - " # try:\n", - " # #Three options here, comment out as needed:\n", - "\n", - " # #scipy wavfile\n", - " # #rate, data = wavfile.read(wav_loc)\n", - " \n", - " # #librosa\n", - " # rate, data = librosa.load(wav_loc)\n", - " \n", - " # #base python\n", - " # #rate, data = wave.open(wav_loc)\n", - "\n", - " # #explore fourth 
option: use ffmpeg directly \n", - " # #ffmpeg?\n", - "\n", - " # data = data.astype('float32')\n", - " \n", - " # # get audio for each turn\n", - " # df_audio[\"audio\"] = [\n", - " # data[int(st * rate) : int(et * rate)].copy(deep=True)\n", - " # for st, et in zip(df_audio.begin.values, df_audio.end.values)\n", - " # ]\n", - "\n", - " # df_audio[\"rate\"] = rate\n", - " # except Exception:\n", - " # pass\n", - "\n", - " # return df_audio\n", - "\n", - "[get_row_audio(df_audio[df_audio.source == source]) for source in df_audio.source[1:10]]\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "./Elpaco dataset/akhoe_haikom1/state_hospital.wav\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df[\"rate\"] = rate\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "./Elpaco dataset/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo.wav\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df[\"rate\"] = rate\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "./Elpaco dataset/ambel1/AM064.wav\n" - ] - }, - { - "name": "stderr", - 
"output_type": "stream", - "text": [ - "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df[\"rate\"] = rate\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "./Elpaco dataset/ambel1/AM067.wav\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df[\"rate\"] = rate\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "./Elpaco dataset/anal1/anm_20160916_PO_Wolring_1.wav\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df[\"rate\"] = rate\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "./Elpaco dataset/anal1/anm_20160924_Thotson_grandmothers_1.wav\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] 
= value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df[\"rate\"] = rate\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "./Elpaco dataset/anal1/anm_20161014_PO_Darchol_evening_conversation2.wav\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df[\"rate\"] = rate\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "./Elpaco dataset/anal1/anm_20161014_PO_Ralruwng_family_lunch1.wav\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df[\"rate\"] = rate\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "./Elpaco dataset/anal1/anm_20161210_oklu_chatting3.wav\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " 
df[\"rate\"] = rate\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "./Elpaco dataset/anal1/anm_20190803_grandmas.wav\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2478552/3405997377.py:33: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df[\"rate\"] = rate\n" - ] - } - ], - "source": [ - "def get_row_audio(df, wav_loc):\n", - " \"\"\" load audio and grab individual turns\n", - " TODO: for large sparse WAV files, the audio should be loaded only for the turn\n", - " \"\"\"\n", - " \n", - " #print path to audio file (optional)\n", - " print(wav_loc)\n", - "\n", - " # load audio\n", - "\n", - " #Three options here, comment out as needed:\n", - "\n", - " #scipy wavfile\n", - " #rate, data = wavfile.read(wav_loc)\n", - " \n", - " #librosa\n", - " data, rate = librosa.load(wav_loc)\n", - " \n", - " #base python\n", - " #rate, data = wave.open(wav_loc)\n", - "\n", - " #explore fourth option: use ffmpeg directly \n", - " #ffmpeg?\n", - "\n", - " data = data.astype('float32')\n", - " \n", - " # get audio for each turn\n", - " # df_audio[\"audio\"] = [\n", - " # data[int(st * rate) : int(et * rate)].copy()\n", - " # for st, et in zip(df_audio.begin.values, df_audio.end.values)\n", - " # ]\n", - "\n", - " df[\"rate\"] = rate\n", - "\n", - " return df\n", - "\n", - "sounddata = [get_row_audio(df_audio[df_audio.source == source], f'./Elpaco dataset{source}.wav') for source in df_audio.source[:10]]" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "33626250" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - 
"source": [ - "len(sounddata)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1313.280\n", - "1 441.654\n", - "2 327.840\n", - "3 391.990\n", - "4 527.017\n", - "5 63.633\n", - "6 152.779\n", - "7 1278.735\n", - "8 1070.298\n", - "9 659.145\n", - "Name: end, dtype: float64" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_audio_sub.iloc[:,3]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Loops for execution" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "./Elpaco dataset/akhoe_haikom1/state_hospital.wav\n", - "[-0.0135576 0.00900845 -0.00217441 ... -0.00039207 -0.00038838\n", - " -0.00041938]\n", - "./Elpaco dataset/akhoe_haikom1/state_hospital.wav\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2450982/2935496521.py:19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df_audio[\"rate\"] = rate\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[-0.0135576 0.00900845 -0.00217441 ... 
-0.00039207 -0.00038838\n", - " -0.00041938]\n", - "./Elpaco dataset/akhoe_haikom1/state_hospital.wav\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2450982/2935496521.py:19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df_audio[\"rate\"] = rate\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[-0.0135576 0.00900845 -0.00217441 ... -0.00039207 -0.00038838\n", - " -0.00041938]\n", - "./Elpaco dataset/akhoe_haikom1/state_hospital.wav\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2450982/2935496521.py:19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df_audio[\"rate\"] = rate\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[-0.0135576 0.00900845 -0.00217441 ... 
-0.00039207 -0.00038838\n", - " -0.00041938]\n", - "./Elpaco dataset/akhoe_haikom1/state_hospital.wav\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2450982/2935496521.py:19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df_audio[\"rate\"] = rate\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[-0.0135576 0.00900845 -0.00217441 ... -0.00039207 -0.00038838\n", - " -0.00041938]\n", - "./Elpaco dataset/akhoe_haikom1/state_hospital.wav\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2450982/2935496521.py:19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df_audio[\"rate\"] = rate\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[-0.0135576 0.00900845 -0.00217441 ... 
-0.00039207 -0.00038838\n", - " -0.00041938]\n", - "./Elpaco dataset/akhoe_haikom1/state_hospital.wav\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2450982/2935496521.py:19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df_audio[\"rate\"] = rate\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[-0.0135576 0.00900845 -0.00217441 ... -0.00039207 -0.00038838\n", - " -0.00041938]\n", - "./Elpaco dataset/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo.wav\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2450982/2935496521.py:19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df_audio[\"rate\"] = rate\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 3.2782555e-07 -4.1872263e-05 -2.2262335e-05 ... 
-4.3265522e-05\n", - " -1.9449741e-05 -5.1818788e-05]\n", - "./Elpaco dataset/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo.wav\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2450982/2935496521.py:19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df_audio[\"rate\"] = rate\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 3.2782555e-07 -4.1872263e-05 -2.2262335e-05 ... -4.3265522e-05\n", - " -1.9449741e-05 -5.1818788e-05]\n", - "[ uid language begin end duration \n", - "0 akhoe_haikom-2-194-371425 akhoe_haikom 371425.0 371725.0 300.0 \\\n", - "1 akhoe_haikom-2-203-398335 akhoe_haikom 398335.0 398545.0 210.0 \n", - "2 akhoe_haikom-2-457-1245412 akhoe_haikom 1245412.0 1245780.0 368.0 \n", - "3 akhoe_haikom-2-459-1247720 akhoe_haikom 1247720.0 1248010.0 290.0 \n", - "4 akhoe_haikom-2-482-1290491 akhoe_haikom 1290491.0 1290851.0 360.0 \n", - "5 akhoe_haikom-2-484-1291852 akhoe_haikom 1291852.0 1292192.0 340.0 \n", - "6 akhoe_haikom-2-492-1300050 akhoe_haikom 1300050.0 1300810.0 760.0 \n", - "7 akhoe_haikom-2-494-1312920 akhoe_haikom 1312920.0 1313280.0 360.0 \n", - "\n", - " source participant utterance_stripped utterance \n", - "0 /akhoe_haikom1/state_hospital tx@Es î î \\\n", - "1 /akhoe_haikom1/state_hospital tx@Es î î \n", - "2 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "3 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "4 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "5 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "6 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "7 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "\n", - " form_ascii rate \n", - "0 i_ 22050 \n", - "1 i_ 22050 \n", - "2 i_ 22050 \n", - "3 i_ 
22050 \n", - "4 i_ 22050 \n", - "5 i_ 22050 \n", - "6 i_ 22050 \n", - "7 i_ 22050 , uid language begin end duration \n", - "0 akhoe_haikom-2-194-371425 akhoe_haikom 371425.0 371725.0 300.0 \\\n", - "1 akhoe_haikom-2-203-398335 akhoe_haikom 398335.0 398545.0 210.0 \n", - "2 akhoe_haikom-2-457-1245412 akhoe_haikom 1245412.0 1245780.0 368.0 \n", - "3 akhoe_haikom-2-459-1247720 akhoe_haikom 1247720.0 1248010.0 290.0 \n", - "4 akhoe_haikom-2-482-1290491 akhoe_haikom 1290491.0 1290851.0 360.0 \n", - "5 akhoe_haikom-2-484-1291852 akhoe_haikom 1291852.0 1292192.0 340.0 \n", - "6 akhoe_haikom-2-492-1300050 akhoe_haikom 1300050.0 1300810.0 760.0 \n", - "7 akhoe_haikom-2-494-1312920 akhoe_haikom 1312920.0 1313280.0 360.0 \n", - "\n", - " source participant utterance_stripped utterance \n", - "0 /akhoe_haikom1/state_hospital tx@Es î î \\\n", - "1 /akhoe_haikom1/state_hospital tx@Es î î \n", - "2 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "3 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "4 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "5 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "6 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "7 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "\n", - " form_ascii rate \n", - "0 i_ 22050 \n", - "1 i_ 22050 \n", - "2 i_ 22050 \n", - "3 i_ 22050 \n", - "4 i_ 22050 \n", - "5 i_ 22050 \n", - "6 i_ 22050 \n", - "7 i_ 22050 , uid language begin end duration \n", - "0 akhoe_haikom-2-194-371425 akhoe_haikom 371425.0 371725.0 300.0 \\\n", - "1 akhoe_haikom-2-203-398335 akhoe_haikom 398335.0 398545.0 210.0 \n", - "2 akhoe_haikom-2-457-1245412 akhoe_haikom 1245412.0 1245780.0 368.0 \n", - "3 akhoe_haikom-2-459-1247720 akhoe_haikom 1247720.0 1248010.0 290.0 \n", - "4 akhoe_haikom-2-482-1290491 akhoe_haikom 1290491.0 1290851.0 360.0 \n", - "5 akhoe_haikom-2-484-1291852 akhoe_haikom 1291852.0 1292192.0 340.0 \n", - "6 akhoe_haikom-2-492-1300050 akhoe_haikom 1300050.0 1300810.0 760.0 \n", - "7 akhoe_haikom-2-494-1312920 akhoe_haikom 1312920.0 
1313280.0 360.0 \n", - "\n", - " source participant utterance_stripped utterance \n", - "0 /akhoe_haikom1/state_hospital tx@Es î î \\\n", - "1 /akhoe_haikom1/state_hospital tx@Es î î \n", - "2 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "3 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "4 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "5 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "6 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "7 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "\n", - " form_ascii rate \n", - "0 i_ 22050 \n", - "1 i_ 22050 \n", - "2 i_ 22050 \n", - "3 i_ 22050 \n", - "4 i_ 22050 \n", - "5 i_ 22050 \n", - "6 i_ 22050 \n", - "7 i_ 22050 , uid language begin end duration \n", - "0 akhoe_haikom-2-194-371425 akhoe_haikom 371425.0 371725.0 300.0 \\\n", - "1 akhoe_haikom-2-203-398335 akhoe_haikom 398335.0 398545.0 210.0 \n", - "2 akhoe_haikom-2-457-1245412 akhoe_haikom 1245412.0 1245780.0 368.0 \n", - "3 akhoe_haikom-2-459-1247720 akhoe_haikom 1247720.0 1248010.0 290.0 \n", - "4 akhoe_haikom-2-482-1290491 akhoe_haikom 1290491.0 1290851.0 360.0 \n", - "5 akhoe_haikom-2-484-1291852 akhoe_haikom 1291852.0 1292192.0 340.0 \n", - "6 akhoe_haikom-2-492-1300050 akhoe_haikom 1300050.0 1300810.0 760.0 \n", - "7 akhoe_haikom-2-494-1312920 akhoe_haikom 1312920.0 1313280.0 360.0 \n", - "\n", - " source participant utterance_stripped utterance \n", - "0 /akhoe_haikom1/state_hospital tx@Es î î \\\n", - "1 /akhoe_haikom1/state_hospital tx@Es î î \n", - "2 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "3 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "4 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "5 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "6 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "7 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "\n", - " form_ascii rate \n", - "0 i_ 22050 \n", - "1 i_ 22050 \n", - "2 i_ 22050 \n", - "3 i_ 22050 \n", - "4 i_ 22050 \n", - "5 i_ 22050 \n", - "6 i_ 22050 \n", - "7 i_ 22050 , uid language begin end duration \n", - 
"0 akhoe_haikom-2-194-371425 akhoe_haikom 371425.0 371725.0 300.0 \\\n", - "1 akhoe_haikom-2-203-398335 akhoe_haikom 398335.0 398545.0 210.0 \n", - "2 akhoe_haikom-2-457-1245412 akhoe_haikom 1245412.0 1245780.0 368.0 \n", - "3 akhoe_haikom-2-459-1247720 akhoe_haikom 1247720.0 1248010.0 290.0 \n", - "4 akhoe_haikom-2-482-1290491 akhoe_haikom 1290491.0 1290851.0 360.0 \n", - "5 akhoe_haikom-2-484-1291852 akhoe_haikom 1291852.0 1292192.0 340.0 \n", - "6 akhoe_haikom-2-492-1300050 akhoe_haikom 1300050.0 1300810.0 760.0 \n", - "7 akhoe_haikom-2-494-1312920 akhoe_haikom 1312920.0 1313280.0 360.0 \n", - "\n", - " source participant utterance_stripped utterance \n", - "0 /akhoe_haikom1/state_hospital tx@Es î î \\\n", - "1 /akhoe_haikom1/state_hospital tx@Es î î \n", - "2 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "3 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "4 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "5 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "6 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "7 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "\n", - " form_ascii rate \n", - "0 i_ 22050 \n", - "1 i_ 22050 \n", - "2 i_ 22050 \n", - "3 i_ 22050 \n", - "4 i_ 22050 \n", - "5 i_ 22050 \n", - "6 i_ 22050 \n", - "7 i_ 22050 , uid language begin end duration \n", - "0 akhoe_haikom-2-194-371425 akhoe_haikom 371425.0 371725.0 300.0 \\\n", - "1 akhoe_haikom-2-203-398335 akhoe_haikom 398335.0 398545.0 210.0 \n", - "2 akhoe_haikom-2-457-1245412 akhoe_haikom 1245412.0 1245780.0 368.0 \n", - "3 akhoe_haikom-2-459-1247720 akhoe_haikom 1247720.0 1248010.0 290.0 \n", - "4 akhoe_haikom-2-482-1290491 akhoe_haikom 1290491.0 1290851.0 360.0 \n", - "5 akhoe_haikom-2-484-1291852 akhoe_haikom 1291852.0 1292192.0 340.0 \n", - "6 akhoe_haikom-2-492-1300050 akhoe_haikom 1300050.0 1300810.0 760.0 \n", - "7 akhoe_haikom-2-494-1312920 akhoe_haikom 1312920.0 1313280.0 360.0 \n", - "\n", - " source participant utterance_stripped utterance \n", - "0 /akhoe_haikom1/state_hospital tx@Es î 
î \\\n", - "1 /akhoe_haikom1/state_hospital tx@Es î î \n", - "2 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "3 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "4 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "5 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "6 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "7 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "\n", - " form_ascii rate \n", - "0 i_ 22050 \n", - "1 i_ 22050 \n", - "2 i_ 22050 \n", - "3 i_ 22050 \n", - "4 i_ 22050 \n", - "5 i_ 22050 \n", - "6 i_ 22050 \n", - "7 i_ 22050 , uid language begin end duration \n", - "0 akhoe_haikom-2-194-371425 akhoe_haikom 371425.0 371725.0 300.0 \\\n", - "1 akhoe_haikom-2-203-398335 akhoe_haikom 398335.0 398545.0 210.0 \n", - "2 akhoe_haikom-2-457-1245412 akhoe_haikom 1245412.0 1245780.0 368.0 \n", - "3 akhoe_haikom-2-459-1247720 akhoe_haikom 1247720.0 1248010.0 290.0 \n", - "4 akhoe_haikom-2-482-1290491 akhoe_haikom 1290491.0 1290851.0 360.0 \n", - "5 akhoe_haikom-2-484-1291852 akhoe_haikom 1291852.0 1292192.0 340.0 \n", - "6 akhoe_haikom-2-492-1300050 akhoe_haikom 1300050.0 1300810.0 760.0 \n", - "7 akhoe_haikom-2-494-1312920 akhoe_haikom 1312920.0 1313280.0 360.0 \n", - "\n", - " source participant utterance_stripped utterance \n", - "0 /akhoe_haikom1/state_hospital tx@Es î î \\\n", - "1 /akhoe_haikom1/state_hospital tx@Es î î \n", - "2 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "3 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "4 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "5 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "6 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "7 /akhoe_haikom1/state_hospital tx@Ga î î \n", - "\n", - " form_ascii rate \n", - "0 i_ 22050 \n", - "1 i_ 22050 \n", - "2 i_ 22050 \n", - "3 i_ 22050 \n", - "4 i_ 22050 \n", - "5 i_ 22050 \n", - "6 i_ 22050 \n", - "7 i_ 22050 , uid language begin end duration \n", - "8 akie-1-030-95822 akie 95822.0 96442.0 620.0 \\\n", - "9 akie-1-033-99525 akie 99525.0 100289.0 764.0 \n", - "10 
akie-1-041-112708 akie 112708.0 113647.0 939.0 \n", - "11 akie-1-048-125403 akie 125403.0 125935.0 532.0 \n", - "12 akie-1-074-183638 akie 183638.0 184542.0 904.0 \n", - "13 akie-1-076-186733 akie 186733.0 187438.0 705.0 \n", - "14 akie-1-078-189573 akie 189573.0 190705.0 1132.0 \n", - "15 akie-1-083-198073 akie 198073.0 198794.0 721.0 \n", - "16 akie-1-087-205578 akie 205578.0 206094.0 516.0 \n", - "17 akie-1-103-230912 akie 230912.0 231810.0 898.0 \n", - "18 akie-1-105-234173 akie 234173.0 234831.0 658.0 \n", - "19 akie-1-108-237562 akie 237562.0 238347.0 785.0 \n", - "20 akie-1-127-274810 akie 274810.0 275768.0 958.0 \n", - "21 akie-1-132-285746 akie 285746.0 286489.0 743.0 \n", - "22 akie-1-146-315052 akie 315052.0 316031.0 979.0 \n", - "23 akie-1-149-320678 akie 320678.0 321089.0 411.0 \n", - "24 akie-1-153-327799 akie 327799.0 328542.0 743.0 \n", - "25 akie-1-156-332779 akie 332779.0 333547.0 768.0 \n", - "26 akie-1-172-373624 akie 373624.0 374426.0 802.0 \n", - "27 akie-1-176-380275 akie 380275.0 381003.0 728.0 \n", - "28 akie-1-178-386315 akie 386315.0 386870.0 555.0 \n", - "29 akie-1-180-392001 akie 392001.0 392513.0 512.0 \n", - "30 akie-1-182-395794 akie 395794.0 396578.0 784.0 \n", - "31 akie-1-190-411123 akie 411123.0 411734.0 611.0 \n", - "32 akie-1-193-414528 akie 414528.0 415213.0 685.0 \n", - "33 akie-1-196-417624 akie 417624.0 418247.0 623.0 \n", - "34 akie-1-198-420819 akie 420819.0 421148.0 329.0 \n", - "35 akie-1-209-441166 akie 441166.0 441654.0 488.0 \n", - "36 akie-1-212-449198 akie 449198.0 449791.0 593.0 \n", - "\n", - " source participant \n", - "8 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \\\n", - "9 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "10 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "11 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "12 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "13 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "14 
/akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "15 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "16 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "17 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "18 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "19 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "20 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "21 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "22 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "23 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "24 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "25 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "26 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "27 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "28 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "29 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "30 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "31 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "32 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "33 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "34 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "35 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "36 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "\n", - " utterance_stripped utterance form_ascii rate \n", - "8 aá aá aa_ 22050 \n", - "9 aá aá aa_ 22050 \n", - "10 aá aá aa_ 22050 \n", - "11 aá aá aa_ 22050 \n", - "12 aá aá aa_ 22050 \n", - "13 aá aá aa_ 22050 \n", - "14 aá aá aa_ 22050 \n", - "15 aá aá aa_ 22050 \n", - "16 aá aá aa_ 22050 \n", - "17 aá aá aa_ 22050 \n", - "18 aá aá aa_ 22050 \n", - "19 aá aá aa_ 22050 \n", - "20 aá aá aa_ 22050 \n", - "21 aá aá aa_ 22050 \n", - "22 aá aá aa_ 22050 \n", - "23 aá aá aa_ 22050 \n", - "24 aá aá aa_ 22050 \n", - "25 aá aá aa_ 22050 \n", - "26 aá 
aá aa_ 22050 \n", - "27 aá aá aa_ 22050 \n", - "28 aá aá aa_ 22050 \n", - "29 aá aá aa_ 22050 \n", - "30 aá aá aa_ 22050 \n", - "31 aá aá aa_ 22050 \n", - "32 aá aá aa_ 22050 \n", - "33 aá aá aa_ 22050 \n", - "34 aá aá aa_ 22050 \n", - "35 aá aá aa_ 22050 \n", - "36 aá aá aa_ 22050 , uid language begin end duration \n", - "8 akie-1-030-95822 akie 95822.0 96442.0 620.0 \\\n", - "9 akie-1-033-99525 akie 99525.0 100289.0 764.0 \n", - "10 akie-1-041-112708 akie 112708.0 113647.0 939.0 \n", - "11 akie-1-048-125403 akie 125403.0 125935.0 532.0 \n", - "12 akie-1-074-183638 akie 183638.0 184542.0 904.0 \n", - "13 akie-1-076-186733 akie 186733.0 187438.0 705.0 \n", - "14 akie-1-078-189573 akie 189573.0 190705.0 1132.0 \n", - "15 akie-1-083-198073 akie 198073.0 198794.0 721.0 \n", - "16 akie-1-087-205578 akie 205578.0 206094.0 516.0 \n", - "17 akie-1-103-230912 akie 230912.0 231810.0 898.0 \n", - "18 akie-1-105-234173 akie 234173.0 234831.0 658.0 \n", - "19 akie-1-108-237562 akie 237562.0 238347.0 785.0 \n", - "20 akie-1-127-274810 akie 274810.0 275768.0 958.0 \n", - "21 akie-1-132-285746 akie 285746.0 286489.0 743.0 \n", - "22 akie-1-146-315052 akie 315052.0 316031.0 979.0 \n", - "23 akie-1-149-320678 akie 320678.0 321089.0 411.0 \n", - "24 akie-1-153-327799 akie 327799.0 328542.0 743.0 \n", - "25 akie-1-156-332779 akie 332779.0 333547.0 768.0 \n", - "26 akie-1-172-373624 akie 373624.0 374426.0 802.0 \n", - "27 akie-1-176-380275 akie 380275.0 381003.0 728.0 \n", - "28 akie-1-178-386315 akie 386315.0 386870.0 555.0 \n", - "29 akie-1-180-392001 akie 392001.0 392513.0 512.0 \n", - "30 akie-1-182-395794 akie 395794.0 396578.0 784.0 \n", - "31 akie-1-190-411123 akie 411123.0 411734.0 611.0 \n", - "32 akie-1-193-414528 akie 414528.0 415213.0 685.0 \n", - "33 akie-1-196-417624 akie 417624.0 418247.0 623.0 \n", - "34 akie-1-198-420819 akie 420819.0 421148.0 329.0 \n", - "35 akie-1-209-441166 akie 441166.0 441654.0 488.0 \n", - "36 akie-1-212-449198 akie 449198.0 449791.0 593.0 \n", 
- "\n", - " source participant \n", - "8 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \\\n", - "9 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "10 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "11 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "12 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "13 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "14 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "15 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "16 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "17 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "18 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "19 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "20 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "21 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "22 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "23 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "24 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "25 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "26 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "27 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "28 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "29 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "30 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "31 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "32 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "33 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "34 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "35 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "36 /akie1/2014-01-20Gitu4ConversationBahatiNkoiseyyo B \n", - "\n", - " utterance_stripped utterance form_ascii rate \n", - "8 aá aá aa_ 22050 \n", - "9 aá aá aa_ 22050 \n", - "10 aá aá aa_ 22050 
\n", - "11 aá aá aa_ 22050 \n", - "12 aá aá aa_ 22050 \n", - "13 aá aá aa_ 22050 \n", - "14 aá aá aa_ 22050 \n", - "15 aá aá aa_ 22050 \n", - "16 aá aá aa_ 22050 \n", - "17 aá aá aa_ 22050 \n", - "18 aá aá aa_ 22050 \n", - "19 aá aá aa_ 22050 \n", - "20 aá aá aa_ 22050 \n", - "21 aá aá aa_ 22050 \n", - "22 aá aá aa_ 22050 \n", - "23 aá aá aa_ 22050 \n", - "24 aá aá aa_ 22050 \n", - "25 aá aá aa_ 22050 \n", - "26 aá aá aa_ 22050 \n", - "27 aá aá aa_ 22050 \n", - "28 aá aá aa_ 22050 \n", - "29 aá aá aa_ 22050 \n", - "30 aá aá aa_ 22050 \n", - "31 aá aá aa_ 22050 \n", - "32 aá aá aa_ 22050 \n", - "33 aá aá aa_ 22050 \n", - "34 aá aá aa_ 22050 \n", - "35 aá aá aa_ 22050 \n", - "36 aá aá aa_ 22050 ]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_2450982/2935496521.py:19: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " df_audio[\"rate\"] = rate\n" - ] - } - ], - "source": [ - "#execute as for loop\n", - "#[get_row_audio(df_audio[df_audio.source == source], './Elpaco dataset'+ source +'.wav') for source in df_audio.source]\n", - "#[get_sampling_rate(f'./Elpaco dataset{source}.wav') for source in df_audio.source[1:10]]\n", - "\n", - "df_audio_10 = [get_row_audio(df_audio[df_audio.source == source], f'./Elpaco dataset{source}.wav') for source in df_audio.source[1:10]]\n", - "#[get_sampling_rate('./Elpaco dataset'+ source +'.wav') for source in df_audio.source]\n", - "\n", - "print(df_audio_10)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "list indices must be integers or slices, not str", - "output_type": "error", - "traceback": [ - 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[24], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mprint\u001b[39m(df_audio_10[\u001b[39m\"\u001b[39;49m\u001b[39mrate\u001b[39;49m\u001b[39m\"\u001b[39;49m])\n", - "\u001b[0;31mTypeError\u001b[0m: list indices must be integers or slices, not str" - ] - } - ], - "source": [ - "print(df_audio_10[\"rate\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 0%| | 0/139551 [00:00 4\u001b[0m df_audios \u001b[39m=\u001b[39m parallel(\n\u001b[1;32m 5\u001b[0m delayed(get_row_audio)(\n\u001b[1;32m 6\u001b[0m df_audio[df_audio\u001b[39m.\u001b[39;49msource \u001b[39m==\u001b[39;49m source], \n\u001b[1;32m 7\u001b[0m \u001b[39m'\u001b[39;49m\u001b[39m./Elpaco dataset\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39m+\u001b[39;49m source \u001b[39m+\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39m.wav\u001b[39;49m\u001b[39m'\u001b[39;49m, \u001b[39m#Edit path to corpus directory here\u001b[39;49;00m\n\u001b[1;32m 8\u001b[0m )\n\u001b[1;32m 9\u001b[0m \u001b[39mfor\u001b[39;49;00m source \u001b[39min\u001b[39;49;00m tqdm(df_audio\u001b[39m.\u001b[39;49msource)\n\u001b[1;32m 10\u001b[0m )\n\u001b[1;32m 11\u001b[0m df_audio \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39mconcat(df_audios)\n\u001b[1;32m 12\u001b[0m \u001b[39mlen\u001b[39m(df_audio)\n", - "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/joblib/parallel.py:1088\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m 1085\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdispatch_one_batch(iterator):\n\u001b[1;32m 1086\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_iterating \u001b[39m=\u001b[39m 
\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_original_iterator \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m-> 1088\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mdispatch_one_batch(iterator):\n\u001b[1;32m 1089\u001b[0m \u001b[39mpass\u001b[39;00m\n\u001b[1;32m 1091\u001b[0m \u001b[39mif\u001b[39;00m pre_dispatch \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mall\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mor\u001b[39;00m n_jobs \u001b[39m==\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[1;32m 1092\u001b[0m \u001b[39m# The iterable was consumed all at once by the above for loop.\u001b[39;00m\n\u001b[1;32m 1093\u001b[0m \u001b[39m# No need to wait for async callbacks to trigger to\u001b[39;00m\n\u001b[1;32m 1094\u001b[0m \u001b[39m# consumption.\u001b[39;00m\n", - "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/joblib/parallel.py:901\u001b[0m, in \u001b[0;36mParallel.dispatch_one_batch\u001b[0;34m(self, iterator)\u001b[0m\n\u001b[1;32m 899\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m 900\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 901\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_dispatch(tasks)\n\u001b[1;32m 902\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mTrue\u001b[39;00m\n", - "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/joblib/parallel.py:819\u001b[0m, in \u001b[0;36mParallel._dispatch\u001b[0;34m(self, batch)\u001b[0m\n\u001b[1;32m 817\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_lock:\n\u001b[1;32m 818\u001b[0m job_idx \u001b[39m=\u001b[39m \u001b[39mlen\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_jobs)\n\u001b[0;32m--> 819\u001b[0m job \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_backend\u001b[39m.\u001b[39;49mapply_async(batch, callback\u001b[39m=\u001b[39;49mcb)\n\u001b[1;32m 820\u001b[0m \u001b[39m# A 
job can complete so quickly than its callback is\u001b[39;00m\n\u001b[1;32m 821\u001b[0m \u001b[39m# called before we get here, causing self._jobs to\u001b[39;00m\n\u001b[1;32m 822\u001b[0m \u001b[39m# grow. To ensure correct results ordering, .insert is\u001b[39;00m\n\u001b[1;32m 823\u001b[0m \u001b[39m# used (rather than .append) in the following line\u001b[39;00m\n\u001b[1;32m 824\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_jobs\u001b[39m.\u001b[39minsert(job_idx, job)\n", - "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/joblib/_parallel_backends.py:208\u001b[0m, in \u001b[0;36mSequentialBackend.apply_async\u001b[0;34m(self, func, callback)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mapply_async\u001b[39m(\u001b[39mself\u001b[39m, func, callback\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m):\n\u001b[1;32m 207\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Schedule a func to be run\"\"\"\u001b[39;00m\n\u001b[0;32m--> 208\u001b[0m result \u001b[39m=\u001b[39m ImmediateResult(func)\n\u001b[1;32m 209\u001b[0m \u001b[39mif\u001b[39;00m callback:\n\u001b[1;32m 210\u001b[0m callback(result)\n", - "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/joblib/_parallel_backends.py:597\u001b[0m, in \u001b[0;36mImmediateResult.__init__\u001b[0;34m(self, batch)\u001b[0m\n\u001b[1;32m 594\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__init__\u001b[39m(\u001b[39mself\u001b[39m, batch):\n\u001b[1;32m 595\u001b[0m \u001b[39m# Don't delay the application, to avoid keeping the input\u001b[39;00m\n\u001b[1;32m 596\u001b[0m \u001b[39m# arguments in memory\u001b[39;00m\n\u001b[0;32m--> 597\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mresults \u001b[39m=\u001b[39m batch()\n", - "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/joblib/parallel.py:288\u001b[0m, in \u001b[0;36mBatchedCalls.__call__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 284\u001b[0m \u001b[39mdef\u001b[39;00m 
\u001b[39m__call__\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[1;32m 285\u001b[0m \u001b[39m# Set the default nested backend to self._backend but do not set the\u001b[39;00m\n\u001b[1;32m 286\u001b[0m \u001b[39m# change the default number of processes to -1\u001b[39;00m\n\u001b[1;32m 287\u001b[0m \u001b[39mwith\u001b[39;00m parallel_backend(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backend, n_jobs\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_n_jobs):\n\u001b[0;32m--> 288\u001b[0m \u001b[39mreturn\u001b[39;00m [func(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m 289\u001b[0m \u001b[39mfor\u001b[39;00m func, args, kwargs \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mitems]\n", - "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/joblib/parallel.py:288\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 284\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__call__\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[1;32m 285\u001b[0m \u001b[39m# Set the default nested backend to self._backend but do not set the\u001b[39;00m\n\u001b[1;32m 286\u001b[0m \u001b[39m# change the default number of processes to -1\u001b[39;00m\n\u001b[1;32m 287\u001b[0m \u001b[39mwith\u001b[39;00m parallel_backend(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backend, n_jobs\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_n_jobs):\n\u001b[0;32m--> 288\u001b[0m \u001b[39mreturn\u001b[39;00m [func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 289\u001b[0m \u001b[39mfor\u001b[39;00m func, args, kwargs \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mitems]\n", - "Cell \u001b[0;32mIn[4], line 9\u001b[0m, in \u001b[0;36mget_row_audio\u001b[0;34m(df_audio, wav_loc)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[39m#print path to audio file (optional)\u001b[39;00m\n\u001b[1;32m 7\u001b[0m 
\u001b[39mprint\u001b[39m(wav_loc)\n\u001b[0;32m----> 9\u001b[0m data, rate \u001b[39m=\u001b[39m librosa\u001b[39m.\u001b[39;49mload(wav_loc)\n\u001b[1;32m 10\u001b[0m data \u001b[39m=\u001b[39m data\u001b[39m.\u001b[39mastype(\u001b[39m'\u001b[39m\u001b[39mfloat32\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m 11\u001b[0m \u001b[39mprint\u001b[39m(data)\n", - "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/librosa/core/audio.py:190\u001b[0m, in \u001b[0;36mload\u001b[0;34m(path, sr, mono, offset, duration, dtype, res_type)\u001b[0m\n\u001b[1;32m 188\u001b[0m \u001b[39m# Final cleanup for dtype and contiguity\u001b[39;00m\n\u001b[1;32m 189\u001b[0m \u001b[39mif\u001b[39;00m mono:\n\u001b[0;32m--> 190\u001b[0m y \u001b[39m=\u001b[39m to_mono(y)\n\u001b[1;32m 192\u001b[0m \u001b[39mif\u001b[39;00m sr \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 193\u001b[0m y \u001b[39m=\u001b[39m resample(y, orig_sr\u001b[39m=\u001b[39msr_native, target_sr\u001b[39m=\u001b[39msr, res_type\u001b[39m=\u001b[39mres_type)\n", - "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/librosa/core/audio.py:513\u001b[0m, in \u001b[0;36mto_mono\u001b[0;34m(y)\u001b[0m\n\u001b[1;32m 510\u001b[0m util\u001b[39m.\u001b[39mvalid_audio(y, mono\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 512\u001b[0m \u001b[39mif\u001b[39;00m y\u001b[39m.\u001b[39mndim \u001b[39m>\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[0;32m--> 513\u001b[0m y \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39;49mmean(y, axis\u001b[39m=\u001b[39;49m\u001b[39mtuple\u001b[39;49m(\u001b[39mrange\u001b[39;49m(y\u001b[39m.\u001b[39;49mndim \u001b[39m-\u001b[39;49m \u001b[39m1\u001b[39;49m)))\n\u001b[1;32m 515\u001b[0m \u001b[39mreturn\u001b[39;00m y\n", - "File \u001b[0;32m<__array_function__ internals>:200\u001b[0m, in \u001b[0;36mmean\u001b[0;34m(*args, **kwargs)\u001b[0m\n", - "File 
\u001b[0;32m~/elpaco/lib/python3.8/site-packages/numpy/core/fromnumeric.py:3464\u001b[0m, in \u001b[0;36mmean\u001b[0;34m(a, axis, dtype, out, keepdims, where)\u001b[0m\n\u001b[1;32m 3461\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 3462\u001b[0m \u001b[39mreturn\u001b[39;00m mean(axis\u001b[39m=\u001b[39maxis, dtype\u001b[39m=\u001b[39mdtype, out\u001b[39m=\u001b[39mout, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m-> 3464\u001b[0m \u001b[39mreturn\u001b[39;00m _methods\u001b[39m.\u001b[39;49m_mean(a, axis\u001b[39m=\u001b[39;49maxis, dtype\u001b[39m=\u001b[39;49mdtype,\n\u001b[1;32m 3465\u001b[0m out\u001b[39m=\u001b[39;49mout, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n", - "File \u001b[0;32m~/elpaco/lib/python3.8/site-packages/numpy/core/_methods.py:181\u001b[0m, in \u001b[0;36m_mean\u001b[0;34m(a, axis, dtype, out, keepdims, where)\u001b[0m\n\u001b[1;32m 178\u001b[0m dtype \u001b[39m=\u001b[39m mu\u001b[39m.\u001b[39mdtype(\u001b[39m'\u001b[39m\u001b[39mf4\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m 179\u001b[0m is_float16_result \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[0;32m--> 181\u001b[0m ret \u001b[39m=\u001b[39m umr_sum(arr, axis, dtype, out, keepdims, where\u001b[39m=\u001b[39;49mwhere)\n\u001b[1;32m 182\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(ret, mu\u001b[39m.\u001b[39mndarray):\n\u001b[1;32m 183\u001b[0m \u001b[39mwith\u001b[39;00m _no_nep50_warning():\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "#execute using parallel \n", - "\n", - "with Parallel(n_jobs=1, verbose=verbosity) as parallel:\n", - " df_audios = parallel(\n", - " delayed(get_row_audio)(\n", - " df_audio[df_audio.source == source], \n", - " './Elpaco dataset'+ source +'.wav', #Edit path to corpus directory here\n", - " )\n", - " for source in tqdm(df_audio.source)\n", - " )\n", - "df_audio = pd.concat(df_audios)\n", - "len(df_audio)" - ] - }, - { - "attachments": {}, - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "### Normalize audio (optional)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# normalize audio if needed\n", - "df_audio['audio'] = [librosa.util.normalize(i) for i in df_audio.audio.values]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Plot `audio` column waveforms" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 5%|▍ | 11/240 [00:00<00:00, 254.29it/s]\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# plot some example audio \n", - "nrows = 20\n", - "ncols = 12\n", - "zoom = 2\n", - "fig, axs = plt.subplots(ncols=ncols, nrows = nrows,figsize = (ncols*zoom, nrows+zoom/1.5))\n", - "for i, turn in tqdm(enumerate(df_audio['audio'].values), total = nrows*ncols):\n", - " ax = axs.flatten()[i]\n", - " ax.plot(turn)\n", - " if i == nrows*ncols -1:\n", - " break" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save as `csv`" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'df_audio' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/Users/u517177/3.1_Make_audio_and_spec_cols.ipynb Cell 16\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0m df_audio\u001b[39m.\u001b[39mto_csv(\u001b[39m\"\u001b[39m\u001b[39mdf_audio.csv\u001b[39m\u001b[39m\"\u001b[39m, index\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 2\u001b[0m df_audio\n", - "\u001b[0;31mNameError\u001b[0m: name 'df_audio' is not defined" - ] - } - ], - "source": [ - "df_audio.to_csv(\"df_audio.csv\", index=False)\n", - "df_audio" + "sf.write('../data/output.wav', d_librosa, r_librosa, subtype='PCM_24')" ] } ], @@ -2823,7 +182,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.11" }, "vscode": { "interpreter": {