diff --git a/nltk_contrib/timex3.py b/nltk_contrib/timex3.py new file mode 100755 index 0000000..030dc62 --- /dev/null +++ b/nltk_contrib/timex3.py @@ -0,0 +1,366 @@ +# Code for tagging temporal expressions in text +# For details of the TIMEX format, see http://timex2.mitre.org/ +# Converted to Python3 by Brian Hockenmaier in 2019 + +import re +import string +import os +import sys +from datetime import datetime, timedelta + +# Python3 version no longer requires eGenix.com mx Base Distribution +# http://www.egenix.com/products/python/mxBase/ + +# Predefined strings. +numbers = "(^a(?=\s)|one|two|three|four|five|six|seven|eight|nine|ten| \ + eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen| \ + eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty| \ + ninety|hundred|thousand)" +day = "(monday|tuesday|wednesday|thursday|friday|saturday|sunday)" +week_day = "(monday|tuesday|wednesday|thursday|friday|saturday|sunday)" +month = "(january|february|march|april|may|june|july|august|september| \ + october|november|december)" +dmy = "(year|day|week|month)" +rel_day = "(today|yesterday|tomorrow|tonight|tonite)" +exp1 = "(before|after|earlier|later|ago)" +exp2 = "(this|next|last)" +iso = "\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+" +year = "((?<=\s)\d{4}|^\d{4})" +regxp1 = "((\d+|(" + numbers + "[-\s]?)+) " + dmy + "s? " + exp1 + ")" +regxp2 = "(" + exp2 + " (" + dmy + "|" + week_day + "|" + month + "))" + +reg1 = re.compile(regxp1, re.IGNORECASE) +reg2 = re.compile(regxp2, re.IGNORECASE) +reg3 = re.compile(rel_day, re.IGNORECASE) +reg4 = re.compile(iso) +reg5 = re.compile(year) + +def tag(text): + + # Initialization + timex_found = [] + + # re.findall() finds all the substring matches, keep only the full + # matching string. Captures expressions such as 'number of days' ago, etc. + found = reg1.findall(text) + found = [a[0] for a in found if len(a) > 1] + for timex in found: + timex_found.append(timex) + + # Variations of this thursday, next year, etc + found = reg2.findall(text) + found = [a[0] for a in found if len(a) > 1] + for timex in found: + timex_found.append(timex) + + # today, tomorrow, etc + found = reg3.findall(text) + for timex in found: + timex_found.append(timex) + + # ISO + found = reg4.findall(text) + for timex in found: + timex_found.append(timex) + + # Year + found = reg5.findall(text) + for timex in found: + timex_found.append(timex) + + # Tag only temporal expressions which haven't been tagged. + for timex in timex_found: + text = re.sub(timex + '(?!)', '' + timex + '', text) + + return text + +# Hash function for week days to simplify the grounding task. +# [Mon..Sun] -> [0..6] +hashweekdays = { + 'monday': 0, + 'tuesday': 1, + 'wednesday': 2, + 'thursday': 3, + 'friday': 4, + 'saturday': 5, + 'sunday': 6} + +# Hash function for months to simplify the grounding task. +# [Jan..Dec] -> [1..12] +hashmonths = { + 'january': 1, + 'february': 2, + 'march': 3, + 'april': 4, + 'may': 5, + 'june': 6, + 'july': 7, + 'august': 8, + 'september': 9, + 'october': 10, + 'november': 11, + 'december': 12} + +# Hash number in words into the corresponding integer value +def hashnum(number): + if re.match(r'one|^a\b', number, re.IGNORECASE): + return 1 + if re.match(r'two', number, re.IGNORECASE): + return 2 + if re.match(r'three', number, re.IGNORECASE): + return 3 + if re.match(r'four', number, re.IGNORECASE): + return 4 + if re.match(r'five', number, re.IGNORECASE): + return 5 + if re.match(r'six', number, re.IGNORECASE): + return 6 + if re.match(r'seven', number, re.IGNORECASE): + return 7 + if re.match(r'eight', number, re.IGNORECASE): + return 8 + if re.match(r'nine', number, re.IGNORECASE): + return 9 + if re.match(r'ten', number, re.IGNORECASE): + return 10 + if re.match(r'eleven', number, re.IGNORECASE): + return 11 + if re.match(r'twelve', number, re.IGNORECASE): + return 12 + if re.match(r'thirteen', number, re.IGNORECASE): + return 13 + if re.match(r'fourteen', number, re.IGNORECASE): + return 14 + if re.match(r'fifteen', number, re.IGNORECASE): + return 15 + if re.match(r'sixteen', number, re.IGNORECASE): + return 16 + if re.match(r'seventeen', number, re.IGNORECASE): + return 17 + if re.match(r'eighteen', number, re.IGNORECASE): + return 18 + if re.match(r'nineteen', number, re.IGNORECASE): + return 19 + if re.match(r'twenty', number, re.IGNORECASE): + return 20 + if re.match(r'thirty', number, re.IGNORECASE): + return 30 + if re.match(r'forty', number, re.IGNORECASE): + return 40 + if re.match(r'fifty', number, re.IGNORECASE): + return 50 + if re.match(r'sixty', number, re.IGNORECASE): + return 60 + if re.match(r'seventy', number, re.IGNORECASE): + return 70 + if re.match(r'eighty', number, re.IGNORECASE): + return 80 + if re.match(r'ninety', number, re.IGNORECASE): + return 90 + if re.match(r'hundred', number, re.IGNORECASE): + return 100 + if re.match(r'thousand', number, re.IGNORECASE): + return 1000 + +# Given a timex_tagged_text and a Date object set to base_date, +# returns timex_grounded_text +def ground(tagged_text, base_date): + + # Find all identified timex and put them into a list + timex_regex = re.compile(r'.*?', re.DOTALL) + timex_found = timex_regex.findall(tagged_text) + timex_found = map(lambda timex:re.sub(r'', '', timex), \ + timex_found) + timexList = [] + + # Calculate the new date accordingly + for timex in timex_found: + # global month + month = "(january|february|march|april|may|june|july|august|september| \ + october|november|december)" + + timex_val = 'UNKNOWN' # Default value + + timex_ori = timex # Backup original timex for later substitution + + # If numbers are given in words, hash them into corresponding numbers. + # eg. twenty five days ago --> 25 days ago + if re.search(numbers, timex, re.IGNORECASE): + split_timex = re.split(r'\s(?=days?|months?|years?|weeks?)', \ + timex, re.IGNORECASE) + value = split_timex[0] + unit = split_timex[1] + num_list = map(lambda s:hashnum(s),re.findall(numbers + '+', \ + value, re.IGNORECASE)) + timex = sum(num_list) + ' ' + unit + + # If timex matches ISO format, remove 'time' and reorder 'date' + if re.match(r'\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+', timex): + dmy = re.split(r'\s', timex)[0] + dmy = re.split(r'/|-', dmy) + timex_val = str(dmy[2]) + '-' + str(dmy[1]) + '-' + str(dmy[0]) + + # Specific dates + elif re.match(r'\d{4}', timex): + timex_val = str(timex) + + # Relative dates + elif re.match(r'tonight|tonite|today', timex, re.IGNORECASE): + timex_val = str(base_date) + elif re.match(r'yesterday', timex, re.IGNORECASE): + timex_val = str(base_date + timedelta(days=-1)) + elif re.match(r'tomorrow', timex, re.IGNORECASE): + timex_val = str(base_date + timedelta(days=+1)) + + # Weekday in the previous week. + elif re.match(r'last ' + week_day, timex, re.IGNORECASE): + target_day = hashweekdays[timex.split()[1]] + monday_of_base_week = base_date - timedelta(days=base_date.weekday()) + monday_of_target_week = base_date + timedelta(weeks=-1) + timex_val = str(monday_of_target_week + timedelta(days=target_day+1)) + + # Weekday in the current week. + elif re.match(r'this ' + week_day, timex, re.IGNORECASE): + target_day = hashweekdays[timex.split()[1]] + monday_of_base_week = base_date - timedelta(days=base_date.weekday()) + monday_of_target_week = base_date + timedelta(weeks=0) + timex_val = str(monday_of_target_week + timedelta(days=target_day+1)) + + # Weekday in the following week. + elif re.match(r'next ' + week_day, timex, re.IGNORECASE): + target_day = hashweekdays[timex.split()[1]] + monday_of_base_week = base_date - timedelta(days=base_date.weekday()) + monday_of_target_week = base_date + timedelta(weeks=+1) + timex_val = str(monday_of_target_week + timedelta(days=target_day+1)) + + # Last, this, next week. + elif re.match(r'last week', timex, re.IGNORECASE): + year = (base_date + timedelta(weeks=-1)).year + + # iso_week returns a triple (year, week, day) hence, retrieve + # only week value. + week = (base_date + timedelta(weeks=-1)).isocalendar()[1] + timex_val = str(year) + 'W' + str(week) + elif re.match(r'this week', timex, re.IGNORECASE): + year = (base_date + timedelta(weeks=0)).year + week = (base_date + timedelta(weeks=0)).isocalendar()[1] + timex_val = str(year) + 'W' + str(week) + elif re.match(r'next week', timex, re.IGNORECASE): + year = (base_date + timedelta(weeks=+1)).year + week = (base_date + timedelta(weeks=+1)).isocalendar()[1] + timex_val = str(year) + 'W' + str(week) + + # Month in the previous year. + elif re.match(r'last ' + month, timex, re.IGNORECASE): + month = hashmonths[timex.split()[1]] + timex_val = str(base_date.year - 1) + '-' + str(month) + + # Month in the current year. + elif re.match(r'this ' + month, timex, re.IGNORECASE): + month = hashmonths[timex.split()[1]] + timex_val = str(base_date.year) + '-' + str(month) + + # Month in the following year. + elif re.match(r'next ' + month, timex, re.IGNORECASE): + month = hashmonths[timex.split()[1]] + timex_val = str(base_date.year + 1) + '-' + str(month) + elif re.match(r'last month', timex, re.IGNORECASE): + + # Handles the year boundary. + if base_date.month == 1: + timex_val = str(base_date.year - 1) + '-' + '12' + else: + timex_val = str(base_date.year) + '-' + str(base_date.month - 1) + elif re.match(r'this month', timex, re.IGNORECASE): + timex_val = str(base_date.year) + '-' + str(base_date.month) + elif re.match(r'next month', timex, re.IGNORECASE): + + # Handles the year boundary. + if base_date.month == 12: + timex_val = str(base_date.year + 1) + '-' + '1' + else: + timex_val = str(base_date.year) + '-' + str(base_date.month + 1) + elif re.match(r'last year', timex, re.IGNORECASE): + timex_val = str(base_date.year - 1) + elif re.match(r'this year', timex, re.IGNORECASE): + timex_val = str(base_date.year) + elif re.match(r'next year', timex, re.IGNORECASE): + timex_val = str(base_date.year + 1) + elif re.match(r'\d+ days? (ago|earlier|before)', timex, re.IGNORECASE): + + # Calculate the offset by taking '\d+' part from the timex. + offset = int(re.split(r'\s', timex)[0]) + timex_val = str(base_date + timedelta(days=-offset)) + elif re.match(r'\d+ days? (later|after)', timex, re.IGNORECASE): + offset = int(re.split(r'\s', timex)[0]) + timex_val = str(base_date + timedelta(days=+offset)) + elif re.match(r'\d+ weeks? (ago|earlier|before)', timex, re.IGNORECASE): + offset = int(re.split(r'\s', timex)[0]) + year = (base_date + timedelta(weeks=-offset)).year + week = (base_date + \ + timedelta(weeks=-offset)).isocalendar()[1] + timex_val = str(year) + 'W' + str(week) + elif re.match(r'\d+ weeks? (later|after)', timex, re.IGNORECASE): + offset = int(re.split(r'\s', timex)[0]) + year = (base_date + timedelta(weeks=+offset)).year + week = (base_date + timedelta(weeks=+offset)).isocalendar()[1] + timex_val = str(year) + 'W' + str(week) + elif re.match(r'\d+ months? (ago|earlier|before)', timex, re.IGNORECASE): + extra = 0 + offset = int(re.split(r'\s', timex)[0]) + + # Checks if subtracting the remainder of (offset / 12) to the base month + # crosses the year boundary. + if (base_date.month - offset % 12) < 1: + extra = 1 + + # Calculate new values for the year and the month. + year = str(base_date.year - offset // 12 - extra) + month = str((base_date.month - offset % 12) % 12) + + # Fix for the special case. + if month == '0': + month = '12' + timex_val = year + '-' + month + elif re.match(r'\d+ months? (later|after)', timex, re.IGNORECASE): + extra = 0 + offset = int(re.split(r'\s', timex)[0]) + if (base_date.month + offset % 12) > 12: + extra = 1 + year = str(base_date.year + offset // 12 + extra) + month = str((base_date.month + offset % 12) % 12) + if month == '0': + month = '12' + timex_val = year + '-' + month + elif re.match(r'\d+ years? (ago|earlier|before)', timex, re.IGNORECASE): + offset = int(re.split(r'\s', timex)[0]) + timex_val = str(base_date.year - offset) + elif re.match(r'\d+ years? (later|after)', timex, re.IGNORECASE): + offset = int(re.split(r'\s', timex)[0]) + timex_val = str(base_date.year + offset) + + # Remove 'time' from timex_val. + # For example, If timex_val = 2000-02-20 12:23:34.45, then + # timex_val = 2000-02-20 + timex_val = re.sub(r'\s.*', '', timex_val) + + # Substitute tag+timex in the text with grounded tag+timex. + tagged_text = re.sub('' + timex_ori + '', '' + timex_ori + '', tagged_text) + + timexList.append({ + "text": timex_ori, + "value": timex_val + }) + + return tagged_text, timexList + +#### + +def demo(): + import nltk + text = nltk.corpus.abc.raw('rural.txt')[:10000] + print(tag(text)) + +if __name__ == '__main__': + demo() \ No newline at end of file