diff --git a/nltk_contrib/timex3.py b/nltk_contrib/timex3.py
new file mode 100755
index 0000000..030dc62
--- /dev/null
+++ b/nltk_contrib/timex3.py
@@ -0,0 +1,366 @@
+# Code for tagging temporal expressions in text
+# For details of the TIMEX format, see http://timex2.mitre.org/
+# Converted to Python3 by Brian Hockenmaier in 2019
+
+import re
+import string
+import os
+import sys
+from datetime import datetime, timedelta
+
+# Python3 version no longer requires eGenix.com mx Base Distribution
+# http://www.egenix.com/products/python/mxBase/
+
+# Predefined strings.
# Building blocks for the temporal-expression patterns.  Every fragment is a
# raw string assembled with implicit concatenation: a backslash-newline
# continuation *inside* a string literal would splice the next line's leading
# whitespace into the alternation and silently break words such as "eleven"
# or "october".

# Spelled-out numbers.  "a" only counts at the very start of the searched
# string and when followed by whitespace (e.g. "a week ago").  Longer words
# are listed before their prefixes ("sixteen" before "six") so that
# findall-style scanning returns the whole word.
numbers = (
    r"(^a(?=\s)|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|"
    r"eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|"
    r"ninety|hundred|thousand|one|two|three|four|five|six|seven|eight|nine|"
    r"ten)"
)
day = r"(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
week_day = r"(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
month = (
    r"(january|february|march|april|may|june|july|august|september|"
    r"october|november|december)"
)
dmy = r"(year|day|week|month)"
rel_day = r"(today|yesterday|tomorrow|tonight|tonite)"
exp1 = r"(before|after|earlier|later|ago)"
exp2 = r"(this|next|last)"
# Timestamp of the form <n>[/-]<n>[/-]<n> <h>:<m>:<s>.<fraction>
iso = r"\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+"
# Four digits at the start of the text or right after whitespace.
year = r"((?<=\s)\d{4}|^\d{4})"
# "<count> <unit>[s] <direction>", e.g. "3 days ago", "twenty five years later"
regxp1 = r"((\d+|(" + numbers + r"[-\s]?)+) " + dmy + r"s? " + exp1 + r")"
# "<this|next|last> <unit|weekday|month>", e.g. "next thursday", "last year"
regxp2 = r"(" + exp2 + r" (" + dmy + r"|" + week_day + r"|" + month + r"))"

reg1 = re.compile(regxp1, re.IGNORECASE)
reg2 = re.compile(regxp2, re.IGNORECASE)
reg3 = re.compile(rel_day, re.IGNORECASE)
reg4 = re.compile(iso)
reg5 = re.compile(year)
+
def tag(text):
    """Wrap every temporal expression found in *text* in TIMEX2 tags.

    The module-level patterns reg1..reg5 are searched in turn (counted
    offsets such as "3 days ago", "this/next/last ..." phrases, relative
    days, ISO-like timestamps and four-digit years).  Every occurrence of a
    string found this way is rewritten as ``<TIMEX2>...</TIMEX2>``.

    Args:
        text: the plain text to annotate.

    Returns:
        The annotated text.  Occurrences that are already immediately
        followed by ``</TIMEX2>`` are left untouched.
    """
    timex_found = []

    # reg1 and reg2 contain nested groups, so findall() yields tuples whose
    # first element is the complete expression.
    for pattern in (reg1, reg2):
        timex_found.extend(
            groups[0] for groups in pattern.findall(text) if len(groups) > 1)

    # reg3..reg5 have at most one group, so findall() yields plain strings.
    for pattern in (reg3, reg4, reg5):
        timex_found.extend(pattern.findall(text))

    # Drop duplicates and empty strings; try longer expressions first so a
    # short one (e.g. a year) is never tagged inside a longer one (e.g. a
    # full timestamp).
    unique = sorted({timex for timex in timex_found if timex},
                    key=len, reverse=True)
    if not unique:
        return text

    # One substitution pass over the text avoids nested or repeated tags.
    # The expressions are escaped because they are literal text, not regexes.
    alternatives = '|'.join(re.escape(timex) for timex in unique)
    untagged = re.compile('(?:' + alternatives + ')(?!</TIMEX2>)')
    return untagged.sub(
        lambda match: '<TIMEX2>' + match.group(0) + '</TIMEX2>', text)
+
# Lookup table used while grounding: lowercase weekday name -> index,
# counting from Monday = 0 through Sunday = 6.
hashweekdays = {
    name: index
    for index, name in enumerate((
        'monday',
        'tuesday',
        'wednesday',
        'thursday',
        'friday',
        'saturday',
        'sunday',
    ))
}
+
# Lookup table used while grounding: lowercase month name -> month number,
# counting from January = 1 through December = 12.
hashmonths = {
    name: number
    for number, name in enumerate((
        'january',
        'february',
        'march',
        'april',
        'may',
        'june',
        'july',
        'august',
        'september',
        'october',
        'november',
        'december',
    ), start=1)
}
+
# Spelled-out number -> integer value.  "a" stands for one ("a week ago").
_NUMBER_WORDS = {
    'a': 1, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
    'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
    'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14,
    'fifteen': 15, 'sixteen': 16, 'seventeen': 17, 'eighteen': 18,
    'nineteen': 19, 'twenty': 20, 'thirty': 30, 'forty': 40, 'fifty': 50,
    'sixty': 60, 'seventy': 70, 'eighty': 80, 'ninety': 90,
    'hundred': 100, 'thousand': 1000,
}


def hashnum(number):
    """Return the integer value of a spelled-out number word.

    Only the leading run of letters is considered, so trailing separators or
    text ("twenty-", "a day") are ignored.  The word must match in full: a
    prefix test would mistake "sixteen" or "sixty" for "six".

    Args:
        number: the word to convert; case and surrounding whitespace are
            ignored.

    Returns:
        The integer value, or None when the word is not a known number.
    """
    leading_word = re.match(r'[a-z]+', number.strip().lower())
    if leading_word is None:
        return None
    return _NUMBER_WORDS.get(leading_word.group(0))
+
# Number of units contributed by the leading word of "last/this/next ...".
_TIMEX_SHIFT = {'last': -1, 'this': 0, 'next': 1}

# Direction words that move backwards in time; "later"/"after" move forwards.
_TIMEX_PAST = ('ago', 'earlier', 'before')


def _timex_words_to_digits(timex):
    """Rewrite a spelled-out count as digits ("twenty five days ago" -> "25 days ago")."""
    # flags/maxsplit must be passed by keyword: the third positional
    # argument of re.split() is maxsplit, not flags.
    parts = re.split(r'\s(?=days?|months?|years?|weeks?)', timex,
                     maxsplit=1, flags=re.IGNORECASE)
    if len(parts) != 2:
        return timex
    value, unit = parts

    amounts = [hashnum(word)
               for word in re.findall(r'[a-z]+', value, re.IGNORECASE)]
    if not amounts or None in amounts:
        # Already digits ("5"), or not a quantity at all ("last").
        return timex

    # "hundred" and "thousand" multiply what precedes them; every other
    # word simply adds its value.
    total = current = 0
    for amount in amounts:
        if amount == 100:
            current = max(current, 1) * 100
        elif amount == 1000:
            total += max(current, 1) * 1000
            current = 0
        else:
            current += amount
    return str(total + current) + ' ' + unit


def _timex_iso_week(day):
    """Format the ISO week containing *day* as '<ISO year>W<week number>'."""
    # The ISO year is used (not day.year): around New Year the two differ
    # and the week number only makes sense together with the ISO year.
    iso_year, iso_week = day.isocalendar()[:2]
    return str(iso_year) + 'W' + str(iso_week)


def _timex_shift_month(base_date, delta):
    """Format the month *delta* months away from base_date as '<year>-<month>'."""
    shifted_year, month_index = divmod(
        base_date.year * 12 + base_date.month - 1 + delta, 12)
    return str(shifted_year) + '-' + str(month_index + 1)


def _timex_value(timex, base_date):
    """Return the grounded value of one (digit-normalised) timex, or 'UNKNOWN'."""
    text = timex.lower()

    # Timestamp: drop the time of day and reverse the three date fields.
    if re.match(r'\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+', text):
        first, second, third = re.split(r'[/-]', text.split()[0])
        return third + '-' + second + '-' + first

    # A bare four-digit year.  fullmatch keeps "1000 days ago" out of here.
    if re.fullmatch(r'\d{4}', text):
        return text

    # Relative days.
    if re.match(r'tonight|tonite|today', text):
        return str(base_date)
    if re.match(r'yesterday', text):
        return str(base_date + timedelta(days=-1))
    if re.match(r'tomorrow', text):
        return str(base_date + timedelta(days=+1))

    # "last/this/next" followed by a weekday, a month name or a unit.
    targets = '|'.join(
        list(hashweekdays) + ['week'] + list(hashmonths) + ['month', 'year'])
    relative = re.match(r'(last|this|next) (' + targets + ')', text)
    if relative:
        shift = _TIMEX_SHIFT[relative.group(1)]
        target = relative.group(2)
        if target in hashweekdays:
            # Requested weekday within the previous/current/following week,
            # counted from the Monday of the base week.
            monday_of_base_week = base_date - timedelta(
                days=base_date.weekday())
            return str(monday_of_base_week
                       + timedelta(weeks=shift, days=hashweekdays[target]))
        if target == 'week':
            return _timex_iso_week(base_date + timedelta(weeks=shift))
        if target in hashmonths:
            # Named month in the previous/current/following year.
            return str(base_date.year + shift) + '-' + str(hashmonths[target])
        if target == 'month':
            return _timex_shift_month(base_date, shift)
        return str(base_date.year + shift)

    # "<n> <unit>[s] <direction>", e.g. "3 days ago", "2 months later".
    counted = re.match(
        r'(\d+) (day|week|month|year)s? (ago|earlier|before|later|after)',
        text)
    if counted:
        offset = int(counted.group(1))
        if counted.group(3) in _TIMEX_PAST:
            offset = -offset
        unit = counted.group(2)
        if unit == 'day':
            return str(base_date + timedelta(days=offset))
        if unit == 'week':
            return _timex_iso_week(base_date + timedelta(weeks=offset))
        if unit == 'month':
            return _timex_shift_month(base_date, offset)
        return str(base_date.year + offset)

    return 'UNKNOWN'


def ground(tagged_text, base_date):
    """Resolve every TIMEX2-tagged expression relative to *base_date*.

    Args:
        tagged_text: text whose temporal expressions are wrapped in
            ``<TIMEX2>...</TIMEX2>`` (the format produced by tag()).
        base_date: reference date; must support ``+ timedelta``,
            ``.weekday()``, ``.isocalendar()``, ``.year`` and ``.month``
            (e.g. datetime.date or datetime.datetime).

    Returns:
        A tuple ``(grounded_text, timexList)``.  In grounded_text each
        ``<TIMEX2>`` tag gains a ``val="..."`` attribute holding the grounded
        value.  timexList holds one ``{"text": ..., "value": ...}`` dict per
        tagged expression, in order of appearance.  Expressions that cannot
        be resolved get the value 'UNKNOWN'.
    """
    # Find all identified timex and strip the surrounding tags.
    timex_regex = re.compile(r'<TIMEX2>.*?</TIMEX2>', re.DOTALL)
    timex_found = [re.sub(r'</?TIMEX2.*?>', '', tagged)
                   for tagged in timex_regex.findall(tagged_text)]
    timexList = []

    for timex_ori in timex_found:
        try:
            timex_val = _timex_value(
                _timex_words_to_digits(timex_ori), base_date)
        except OverflowError:
            # Offset too large for date arithmetic.
            timex_val = 'UNKNOWN'

        # Remove 'time' from timex_val.
        # For example, if timex_val = 2000-02-20 12:23:34.45, then
        # timex_val = 2000-02-20
        timex_val = re.sub(r'\s.*', '', timex_val)

        # Substitute tag+timex in the text with grounded tag+timex.  Plain
        # string replacement: the timex is literal text, not a regex.
        tagged_text = tagged_text.replace(
            '<TIMEX2>' + timex_ori + '</TIMEX2>',
            '<TIMEX2 val="' + timex_val + '">' + timex_ori + '</TIMEX2>')

        timexList.append({
            "text": timex_ori,
            "value": timex_val
        })

    return tagged_text, timexList
+
+####
+
def demo():
    """Tag the first 10000 characters of the ABC 'rural.txt' corpus text and print the result."""
    import nltk

    sample = nltk.corpus.abc.raw('rural.txt')
    excerpt = sample[:10000]
    print(tag(excerpt))


if __name__ == '__main__':
    demo()
\ No newline at end of file