Skip to content

Commit

Permalink
Merge pull request #156 from bartdw/master
Browse files Browse the repository at this point in the history
Added extract_duration support for Dutch
  • Loading branch information
ChanceNCounter authored Nov 17, 2020
2 parents b8aa4e3 + 99640f9 commit ee3a580
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 17 deletions.
41 changes: 26 additions & 15 deletions lingua_franca/lang/parse_nl.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,28 +461,39 @@ def extract_duration_nl(text):
return None

time_units = {
'microseconden': None,
'milliseconden': None,
'seconden': None,
'minuten': None,
'uren': None,
'dagen': None,
'weken': None
'microseconds': 0,
'milliseconds': 0,
'seconds': 0,
'minutes': 0,
'hours': 0,
'days': 0,
'weeks': 0
}

pattern = r"(?P<value>\d+(?:\.?\d+)?)\s+{unit}s?"
nl_translations = {
'microseconds': ["microsecond", "microseconde", "microseconden", "microsecondje", "microsecondjes"],
'milliseconds': ["millisecond", "milliseconde", "milliseconden", "millisecondje", "millisecondjes"],
'seconds': ["second", "seconde", "seconden", "secondje", "secondjes"],
'minutes': ["minuut", "minuten", "minuutje", "minuutjes"],
'hours': ["uur", "uren", "uurtje", "uurtjes"],
'days': ["dag", "dagen", "dagje", "dagjes"],
'weeks': ["week", "weken", "weekje", "weekjes"]
}

pattern = r"(?P<value>\d+(?:\.?\d+)?)\s+{unit}"
text = _convert_words_to_numbers_nl(text)

for unit in time_units:
unit_pattern = pattern.format(unit=unit[:-1]) # remove 's' from unit
matches = re.findall(unit_pattern, text)
value = sum(map(float, matches))
time_units[unit] = value
text = re.sub(unit_pattern, '', text)
unit_nl_words = nl_translations[unit]
unit_nl_words.sort(key=len, reverse=True)
for unit_nl in unit_nl_words:
unit_pattern = pattern.format(unit=unit_nl)
matches = re.findall(unit_pattern, text)
value = sum(map(float, matches))
time_units[unit] = time_units[unit] + value
text = re.sub(unit_pattern, '', text)

text = text.strip()
# TODO unit arguments need to be in english
# translation was done wrong, exception thrown here
duration = timedelta(**time_units) if any(time_units.values()) else None

return (duration, text)
Expand Down
24 changes: 22 additions & 2 deletions test/test_parse_nl.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
# limitations under the License.
#
import unittest
from datetime import datetime, time
from datetime import datetime, time, timedelta

from lingua_franca import load_language, set_default_lang, unload_language
from lingua_franca.parse import extract_datetime, extract_number, normalize
from lingua_franca.parse import extract_datetime, extract_number, normalize, extract_duration


LANG = "nl-nl"
Expand Down Expand Up @@ -193,6 +193,26 @@ def test_numbers(self):
normalize("dit is achttien negentien twintig", LANG),
"dit is 18 19 20")

def test_extract_duration_nl(self):
    """Check Dutch duration extraction against a table of phrases.

    Each entry pairs an input phrase with the tuple that
    ``extract_duration(phrase, LANG)`` is expected to return: the
    parsed ``timedelta`` followed by the text left over after the
    duration has been extracted.
    """
    # Note: in some expected remainders a leftover "een" shows up as
    # "1" (e.g. "zet 1 timer voor"), as asserted below.
    expectations = [
        ("een minuut",
         (timedelta(seconds=60), "")),
        ("10 minuten",
         (timedelta(seconds=600), "")),
        ("een uur en 2 minuten",
         (timedelta(seconds=3720), "en")),
        ("een dag",
         (timedelta(days=1), "")),
        ("twee dag",
         (timedelta(days=2), "")),
        ("vijf minuten na het uur",
         (timedelta(seconds=300), "na het uur")),
        ("zet een timer voor 1 uur",
         (timedelta(seconds=3600), "zet 1 timer voor")),
        ("een treinrit van 2 uur, 17 minuten en zestien seconden",
         (timedelta(seconds=8236), "1 treinrit van , en")),
        ("een uurtje",
         (timedelta(seconds=3600), "")),
    ]

    # Cases are checked in order; the first mismatch fails the test.
    for phrase, expected in expectations:
        self.assertEqual(extract_duration(phrase, LANG), expected)


if __name__ == "__main__":
unittest.main()

0 comments on commit ee3a580

Please sign in to comment.