diff --git a/docs/notebooks/example.ipynb b/docs/notebooks/example.ipynb index 09d3d3c..c3734cd 100644 --- a/docs/notebooks/example.ipynb +++ b/docs/notebooks/example.ipynb @@ -19,11 +19,11 @@ "\n", "`scikit-talk` currently has the following parsers:\n", "\n", - "- `ChaFile.parse()`, which parsers .cha files.\n", + "- `Conversation.from_cha()`, which parses .cha files.\n", + "- `Conversation.from_eaf()`, which parses ELAN (.eaf) files.\n", "\n", "Future plans include the creation of parsers for:\n", "\n", - "- .eaf files\n", "- .TextGrid files\n", "- .xml files\n", "\n", @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -50,27 +50,40 @@ "For example, you can download a file from the\n", "[Griffith Corpus of Spoken Australian English](https://ca.talkbank.org/data-orig/GCSAusE/). This publicly available corpus contains transcription files in `.cha` format.\n", "\n", - "We use the `ChaFile.parse` module to create the `Conversation` object:" + "Another publicly available corpus is the [IFADV](https://www.fon.hum.uva.nl/IFA-SpokenLanguageCorpora/IFADVcorpus/Annotations/EAF/) corpus, which contains annotations as `.eaf` files.\n", + "\n", + "We will go over both options below." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parsing a `.cha` file\n", + "\n", + "From the Griffith corpus, we have downloaded [this file](https://ca.talkbank.org/data-orig/GCSAusE/01.cha).\n", + "\n", + "We will parse the file with the `Conversation.from_cha()` method, resulting in a `Conversation` object:" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "griffith01 = sktalk.Conversation.from_cha('GCSAusE_01.cha')\n", + "griffith01 = sktalk.Conversation.from_cha('01.cha')\n", "\n", "griffith01" ] @@ -84,7 +97,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -92,17 +105,17 @@ "text/plain": [ "[Utterance(utterance='0', participant='S', time=[0, 1500], begin='00:00:00.000', end='00:00:01.500', metadata=None, utterance_clean='0', utterance_list=['0'], n_words=1, n_characters=1, time_to_next=None, dyadic=None, FTO=None),\n", " Utterance(utterance=\"mm I'm glad I saw you⇗\", participant='S', time=[1500, 2775], begin='00:00:01.500', end='00:00:02.775', metadata=None, utterance_clean='mm Im glad I saw you', utterance_list=['mm', 'Im', 'glad', 'I', 'saw', 'you'], n_words=6, n_characters=15, time_to_next=None, dyadic=None, FTO=None),\n", - " Utterance(utterance=\"I thought I'd lost you (0.3)\", participant='S', time=[2775, 3773], begin='00:00:02.775', end='00:00:03.773', metadata=None, utterance_clean='I thought Id lost you 03', utterance_list=['I', 'thought', 'Id', 'lost', 'you', '03'], n_words=6, n_characters=19, time_to_next=None, dyadic=None, FTO=None),\n", - " Utterance(utterance=\"⌈no I've been here for a whi:le⌉,\", participant='H', time=[4052, 5515], begin='00:00:04.052', end='00:00:05.515', metadata=None, utterance_clean='no Ive been here for a while',
utterance_list=['no', 'Ive', 'been', 'here', 'for', 'a', 'while'], n_words=7, n_characters=22, time_to_next=None, dyadic=None, FTO=None),\n", - " Utterance(utterance='⌊xxx⌋ (0.3)', participant='S', time=[4052, 5817], begin='00:00:04.052', end='00:00:05.817', metadata=None, utterance_clean='xxx 03', utterance_list=['xxx', '03'], n_words=2, n_characters=5, time_to_next=None, dyadic=None, FTO=None),\n", - " Utterance(utterance=\"⌊hm:: (.) if ʔI couldn't boʔrrow, (1.3) the second (0.2) book of readings fo:r\", participant='S', time=[6140, 9487], begin='00:00:06.140', end='00:00:09.487', metadata=None, utterance_clean='hm if ʔI couldnt boʔrrow the second book of readings for', utterance_list=['hm', 'if', 'ʔI', 'couldnt', 'boʔrrow', 'the', 'second', 'book', 'of', 'readings', 'for'], n_words=11, n_characters=46, time_to_next=None, dyadic=None, FTO=None),\n", - " Utterance(utterance='commu:nicating acro-', participant='H', time=[12888, 14050], begin='00:00:12.888', end='00:00:14.050', metadata=None, utterance_clean='communicating acro', utterance_list=['communicating', 'acro'], n_words=2, n_characters=17, time_to_next=None, dyadic=None, FTO=None),\n", - " Utterance(utterance='no: for family gender and sexuality', participant='H', time=[14050, 17014], begin='00:00:14.050', end='00:00:17.014', metadata=None, utterance_clean='no for family gender and sexuality', utterance_list=['no', 'for', 'family', 'gender', 'and', 'sexuality'], n_words=6, n_characters=29, time_to_next=None, dyadic=None, FTO=None),\n", - " Utterance(utterance=\"+≋ ah: that's the second on is itʔ\", participant='S', time=[17014, 18611], begin='00:00:17.014', end='00:00:18.611', metadata=None, utterance_clean=' ah thats the second on is itʔ', utterance_list=['ah', 'thats', 'the', 'second', 'on', 'is', 'itʔ'], n_words=7, n_characters=23, time_to_next=None, dyadic=None, FTO=None),\n", - " Utterance(utterance=\"+≋ I think it's s⌈ame family gender⌉ has a second book\", participant='H', time=[18611, 21090], 
begin='00:00:18.611', end='00:00:21.090', metadata=None, utterance_clean=' I think its same family gender has a second book', utterance_list=['I', 'think', 'its', 'same', 'family', 'gender', 'has', 'a', 'second', 'book'], n_words=10, n_characters=39, time_to_next=None, dyadic=None, FTO=None)]" + " Utterance(utterance=\"I thought I'd lost you\", participant='S', time=[2775, 3773], begin='00:00:02.775', end='00:00:03.773', metadata=None, utterance_clean='I thought Id lost you', utterance_list=['I', 'thought', 'Id', 'lost', 'you'], n_words=5, n_characters=17, time_to_next=None, dyadic=None, FTO=None),\n", + " Utterance(utterance='le⌉,', participant='H', time=[4052, 5515], begin='00:00:04.052', end='00:00:05.515', metadata=None, utterance_clean='le', utterance_list=['le'], n_words=1, n_characters=2, time_to_next=None, dyadic=None, FTO=None),\n", + " Utterance(utterance='⌊xxx⌋', participant='S', time=[4052, 5817], begin='00:00:04.052', end='00:00:05.817', metadata=None, utterance_clean='xxx', utterance_list=['xxx'], n_words=1, n_characters=3, time_to_next=None, dyadic=None, FTO=None),\n", + " Utterance(utterance=\": (.) 
if ʔI couldn't boʔrrow, (1.3)\", participant='S', time=[6140, 9487], begin='00:00:06.140', end='00:00:09.487', metadata=None, utterance_clean=' if ʔI couldnt boʔrrow 13', utterance_list=['if', 'ʔI', 'couldnt', 'boʔrrow', '13'], n_words=5, n_characters=20, time_to_next=None, dyadic=None, FTO=None),\n", + " Utterance(utterance='r', participant='S', time=[9487, 12888], begin='00:00:09.487', end='00:00:12.888', metadata=None, utterance_clean='r', utterance_list=['r'], n_words=1, n_characters=1, time_to_next=None, dyadic=None, FTO=None),\n", + " Utterance(utterance='nicating acro-', participant='H', time=[12888, 14050], begin='00:00:12.888', end='00:00:14.050', metadata=None, utterance_clean='nicating acro', utterance_list=['nicating', 'acro'], n_words=2, n_characters=12, time_to_next=None, dyadic=None, FTO=None),\n", + " Utterance(utterance='for family gender and sexuality', participant='H', time=[14050, 17014], begin='00:00:14.050', end='00:00:17.014', metadata=None, utterance_clean='for family gender and sexuality', utterance_list=['for', 'family', 'gender', 'and', 'sexuality'], n_words=5, n_characters=27, time_to_next=None, dyadic=None, FTO=None),\n", + " Utterance(utterance=\"that's the second on is itʔ\", participant='S', time=[17014, 18611], begin='00:00:17.014', end='00:00:18.611', metadata=None, utterance_clean='thats the second on is itʔ', utterance_list=['thats', 'the', 'second', 'on', 'is', 'itʔ'], n_words=6, n_characters=21, time_to_next=None, dyadic=None, FTO=None)]" ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -113,13 +126,13 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'source': 'GCSAusE_01.cha',\n", + "{'source': '01.cha',\n", " 'UTF8': '',\n", " 'PID': '11312/t-00017232-1',\n", " 'Languages': ['eng'],\n", @@ -147,7 +160,7 @@ " 'Media': '01, audio'}" ] }, - "execution_count": 4, + "execution_count": 5, 
"metadata": {}, "output_type": "execute_result" } @@ -156,6 +169,211 @@ "griffith01.metadata" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can explore the conversation using the `summary` method:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0 - 1500) S: '0'\n", + "(1500 - 2775) S: 'mm I'm glad I saw you⇗'\n", + "(2775 - 3773) S: 'I thought I'd lost you'\n", + "(4052 - 5515) H: 'le⌉,'\n", + "(4052 - 5817) S: '⌊xxx⌋'\n", + "(6140 - 9487) S: ': (.) if ʔI couldn't boʔrrow, (1.3)'\n", + "(9487 - 12888) S: 'r'\n", + "(12888 - 14050) H: 'nicating acro-'\n", + "(14050 - 17014) H: 'for family gender and sexuality'\n", + "(17014 - 18611) S: 'that's the second on is itʔ'\n", + "(18611 - 21090) H: '+≋ I think it's s⌈ame family gender⌉ has a second book'\n", + "(19011 - 20132) S: '⌊whatever xxx⌋'\n", + "(21090 - 23087) H: 'not communicating across cultures'\n", + "(24457 - 25746) H: '⌈family gen⌈der has two'\n", + "(24457 - 25931) S: '⌊can- ⌊can I borrow it⇗'\n", + "(25931 - 26971) H: 'ʔh ⌈sure'\n", + "(26576 - 27215) S: '⌊thank you'\n", + "(27554 - 28309) H: 'I've got all my-'\n", + "(28700 - 30774) H: 'in fact all my reading books are all together,'\n", + "(31400 - 31876) H: 'so that'\n", + "(32276 - 33530) H: 'se them⇗'\n", + "(33800 - 34706) H: 'I do ∆sort of∆ think-'\n", + "(34706 - 38006) H: 'cause I don't think that one I'll be using (0.2) particularly'\n", + "(38100 - 39261) H: 'in⇗'\n", + "(40100 - 40518) S: 'ʔwhich ʔone'\n", + "(40918 - 41940) H: 'the family gender'\n", + "(42258 - 43175) H: 'I don't think it'd be-'\n", + "(43714 - 45664) H: 'though:: (.) 
you know something in:-'\n", + "(45800 - 47800) H: 'in the social context of Asian business'\n", + "(47800 - 49460) H: '∆°cause°∆ I missed half of that lecture⇗'\n" + ] + } + ], + "source": [ + "griffith01.summary(n=30)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This method also allows us to look in detail at e.g. a specific participant:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0 - 1500) S: '0'\n", + "(1500 - 2775) S: 'mm I'm glad I saw you⇗'\n", + "(2775 - 3773) S: 'I thought I'd lost you'\n", + "(4052 - 5817) S: '⌊xxx⌋'\n", + "(6140 - 9487) S: ': (.) if ʔI couldn't boʔrrow, (1.3)'\n" + ] + } + ], + "source": [ + "griffith01.summary(participant = 'S', n = 5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parsing an `.eaf` file\n", + "\n", + "From the IFADV corpus, we have downloaded [this file](https://www.fon.hum.uva.nl/IFA-SpokenLanguageCorpora/IFADVcorpus/Annotations/EAF/DVA3E.EAF).\n", + "\n", + "We will use the `Conversation.from_eaf()` method to parse the file, resulting in a `Conversation` object." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ifadv03 = sktalk.Conversation.from_eaf(\"DVA3E.EAF\")\n", + "\n", + "ifadv03" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "ELAN formats are a bit more complex than `.cha` files, as they may contain additional annotations (e.g. for gestures). 
These annotations are stored in the ELAN format as different tiers, which end up in the `Conversation` object as utterances from different participants.\n", + "\n", + "We can look at the participants in the conversation:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'kijkrichting spreker1 [v] (TIE1)',\n", + " 'kijkrichting spreker2 [v] (TIE3)',\n", + " 'spreker1 [v] (TIE0)',\n", + " 'spreker2 [v] (TIE2)'}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ifadv03.participants" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this case, we are only interested in `'spreker1 [v] (TIE0)'` and `'spreker2 [v] (TIE2)'`. We want to remove the other \"participants\" from the conversation." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'spreker1 [v] (TIE0)', 'spreker2 [v] (TIE2)'}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ifadv03.remove(participant = 'kijkrichting spreker1 [v] (TIE1)')\n", + "ifadv03.remove(participant = 'kijkrichting spreker2 [v] (TIE3)')\n", + "\n", + "ifadv03.participants" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Another way to ensure only the right tiers are included is to specify the tiers we want to parse when we call the `from_eaf()` method:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'spreker1 [v] (TIE0)', 'spreker2 [v] (TIE2)'}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ifadv03 = sktalk.Conversation.from_eaf(\"DVA3E.EAF\", tiers = ['spreker1 [v] (TIE0)', 'spreker2 [v] (TIE2)'])\n", + "\n", + "ifadv03.participants" + ] + }, {
"cell_type": "markdown", "metadata": {}, @@ -169,23 +387,23 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "⌈no I've been here for a whi:le⌉,\n", - "no Ive been here for a while\n", - "7\n" + "⌈family gen⌈der has two\n", + "family gender has two\n", + "4\n" ] } ], "source": [ - "print(griffith01.utterances[3].utterance)\n", - "print(griffith01.utterances[3].utterance_clean)\n", - "print(griffith01.utterances[3].n_words)" + "print(griffith01.utterances[13].utterance)\n", + "print(griffith01.utterances[13].utterance_clean)\n", + "print(griffith01.utterances[13].n_words)" ] }, { @@ -203,7 +421,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -216,10 +434,10 @@ "[4052, 5515] H - FTO: 279\n", "[4052, 5817] S - FTO: None\n", "[6140, 9487] S - FTO: None\n", - "[12888, 14050] H - FTO: 3401\n", + "[9487, 12888] S - FTO: None\n", + "[12888, 14050] H - FTO: 0\n", "[14050, 17014] H - FTO: None\n", - "[17014, 18611] S - FTO: 0\n", - "[18611, 21090] H - FTO: 0\n" + "[17014, 18611] S - FTO: 0\n" ] } ], @@ -249,7 +467,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -258,7 +476,7 @@ "{'window': 10000, 'planning_buffer': 200, 'n_participants': 2}" ] }, - "execution_count": 7, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -281,7 +499,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -291,7 +509,7 @@ " 'url': 'https://ca.talkbank.org/data-orig/GCSAusE/'}" ] }, - "execution_count": 8, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -312,16 +530,16 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[]" + "[]" ] }, - "execution_count": 
9, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -347,7 +565,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -376,7 +594,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -399,7 +617,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 19, "metadata": {}, "outputs": [ {