Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python 3 version of timex library #26

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
366 changes: 366 additions & 0 deletions nltk_contrib/timex3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,366 @@
# Code for tagging temporal expressions in text
# For details of the TIMEX format, see http://timex2.mitre.org/
# Converted to Python3 by Brian Hockenmaier in 2019

import re
import string
import os
import sys
from datetime import datetime, timedelta
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Python 3's standard-library timedelta support allows us to remove the dependency on mx.DateTime.


# Python3 version no longer requires eGenix.com mx Base Distribution
# http://www.egenix.com/products/python/mxBase/

# Predefined strings.
# Every pattern is a raw string so regex escapes such as \s and \d reach the
# regex engine unchanged.  The long alternations are built with implicit
# string concatenation: a backslash line continuation *inside* a string
# literal keeps the blanks around it, which used to turn alternatives into
# ' eleven', ' eighteen', ' ninety' and ' october' and made them unmatchable.
# In `numbers` the longer words are listed before their prefixes so that a
# left-to-right scan prefers 'sixteen' over 'six'.
numbers = (
    r"(^a(?=\s)|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|"
    r"eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|"
    r"ninety|hundred|thousand|one|two|three|four|five|six|seven|eight|nine|"
    r"ten)"
)
day = "(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
week_day = "(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
month = (
    "(january|february|march|april|may|june|july|august|september|"
    "october|november|december)"
)
dmy = "(year|day|week|month)"
rel_day = "(today|yesterday|tomorrow|tonight|tonite)"
exp1 = "(before|after|earlier|later|ago)"
exp2 = "(this|next|last)"
iso = r"\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+"
year = r"((?<=\s)\d{4}|^\d{4})"

# '<count> <unit>(s) <direction>', e.g. '3 days ago', 'twenty five weeks later'.
regxp1 = r"((\d+|(" + numbers + r"[-\s]?)+) " + dmy + "s? " + exp1 + ")"
# '<this|next|last> <unit | weekday | month name>', e.g. 'next october'.
regxp2 = "(" + exp2 + " (" + dmy + "|" + week_day + "|" + month + "))"

reg1 = re.compile(regxp1, re.IGNORECASE)
reg2 = re.compile(regxp2, re.IGNORECASE)
reg3 = re.compile(rel_day, re.IGNORECASE)
reg4 = re.compile(iso)
reg5 = re.compile(year)

def tag(text):
    """Wrap every temporal expression found in *text* in <TIMEX2> tags.

    The module-level patterns reg1..reg5 are applied in turn (counted
    offsets such as '3 days ago', 'this/next/last ...' phrases, relative
    days, ISO-like timestamps and four-digit years) and each distinct match
    is then tagged everywhere it occurs in the text.

    Returns the tagged copy of *text*.
    """
    timex_found = []

    # reg1 and reg2 contain nested groups, so findall() yields tuples whose
    # first item is the whole matched expression.
    for pattern in (reg1, reg2):
        timex_found.extend(
            groups[0] for groups in pattern.findall(text) if len(groups) > 1)

    # reg3 (today, tomorrow, ...), reg4 (ISO) and reg5 (year) have at most
    # one group, so findall() already yields plain strings.
    for pattern in (reg3, reg4, reg5):
        timex_found.extend(pattern.findall(text))

    # Tag only temporal expressions which haven't been tagged.
    # dict.fromkeys() drops repeated matches while keeping discovery order;
    # one substitution per distinct expression is enough because re.sub()
    # already rewrites every occurrence.
    for timex in dict.fromkeys(timex_found):
        # The expression is matched literally (re.escape), and skipped when
        # it already sits directly after an opening tag or directly before
        # a closing tag, e.g. the year at the start of a tagged ISO stamp.
        pattern = '(?<!<TIMEX2>)' + re.escape(timex) + '(?!</TIMEX2>)'
        text = re.sub(
            pattern,
            lambda match: '<TIMEX2>' + match.group(0) + '</TIMEX2>',
            text)

    return text

# Lookup table from lower-case weekday name to its index, used to simplify
# the grounding task.
# [Mon..Sun] -> [0..6]
hashweekdays = {
    name: index
    for index, name in enumerate((
        'monday', 'tuesday', 'wednesday', 'thursday',
        'friday', 'saturday', 'sunday',
    ))
}

# Lookup table from lower-case month name to its calendar number, used to
# simplify the grounding task.
# [Jan..Dec] -> [1..12]
hashmonths = {
    name: number
    for number, name in enumerate((
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december',
    ), start=1)
}

# Value of each supported number word ('a' counts as one).
_NUMBER_WORDS = {
    'a': 1, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
    'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
    'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14,
    'fifteen': 15, 'sixteen': 16, 'seventeen': 17, 'eighteen': 18,
    'nineteen': 19, 'twenty': 20, 'thirty': 30, 'forty': 40, 'fifty': 50,
    'sixty': 60, 'seventy': 70, 'eighty': 80, 'ninety': 90,
    'hundred': 100, 'thousand': 1000,
}


# Hash number in words into the corresponding integer value
def hashnum(number):
    """Return the integer value of the number word that starts *number*.

    Only the leading run of letters is looked at, case-insensitively, so
    'twenty-five' yields 20.  The whole leading word must be a known number
    word: 'sixteen' is 16, not the 6 that matching the prefix 'six' gives.

    Returns None when *number* does not start with a known number word.
    """
    leading_word = re.match(r'[a-z]+', number, re.IGNORECASE)
    if leading_word is None:
        return None
    return _NUMBER_WORDS.get(leading_word.group(0).lower())

def _words_to_digits(timex):
    """Rewrite a spelled-out count as digits ('twenty five days ago' -> '25 days ago').

    The expression is returned unchanged unless it has the shape
    '<words> <day|week|month|year>...' and every word is a number word
    known to hashnum().  Word values are simply summed.
    """
    # maxsplit/flags must be passed by keyword: the third positional
    # argument of re.split() is maxsplit, not flags.
    parts = re.split(r'\s(?=days?|months?|years?|weeks?)', timex,
                     maxsplit=1, flags=re.IGNORECASE)
    if len(parts) != 2:
        return timex
    value, unit = parts
    amounts = [hashnum(word)
               for word in re.findall(r'[a-z]+', value, re.IGNORECASE)]
    if not amounts or None in amounts:
        return timex
    return str(sum(amounts)) + ' ' + unit


def _week_label(day):
    """Format *day* as '<ISO year>W<ISO week>'.

    The ISO year is used rather than the calendar year so the label stays
    consistent around New Year (2019-12-30 belongs to week 1 of 2020).
    """
    iso_year, iso_week = day.isocalendar()[:2]
    return str(iso_year) + 'W' + str(iso_week)


def _month_label(base_date, shift):
    """Format the month *shift* months away from base_date as 'YYYY-M'."""
    # Count months from year 0 so that year boundaries need no special case.
    target_year, month_index = divmod(
        base_date.year * 12 + base_date.month - 1 + shift, 12)
    return str(target_year) + '-' + str(month_index + 1)


def _ground_timex(timex, base_date):
    """Return the grounded value of one timex, or 'UNKNOWN'."""
    # If numbers are given in words, hash them into corresponding numbers.
    # eg. twenty five days ago --> 25 days ago
    timex = _words_to_digits(timex)

    # If timex matches ISO format, remove 'time' and reorder 'date'
    if re.match(r'\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+', timex):
        fields = re.split(r'/|-', re.split(r'\s', timex)[0])
        return str(fields[2]) + '-' + str(fields[1]) + '-' + str(fields[0])

    # Counted offsets.  Checked before the four-digit test below so that
    # '1000 days ago' is treated as an offset and not as the year 1000.
    counted = re.match(
        r'(\d+) (day|week|month|year)s? (ago|earlier|before|later|after)',
        timex, re.IGNORECASE)
    if counted:
        offset = int(counted.group(1))
        unit = counted.group(2).lower()
        if counted.group(3).lower() in ('ago', 'earlier', 'before'):
            offset = -offset
        if unit == 'day':
            return str(base_date + timedelta(days=offset))
        if unit == 'week':
            return _week_label(base_date + timedelta(weeks=offset))
        if unit == 'month':
            return _month_label(base_date, offset)
        return str(base_date.year + offset)

    # Specific dates
    if re.match(r'\d{4}', timex):
        return str(timex)

    # Relative dates
    if re.match(r'tonight|tonite|today', timex, re.IGNORECASE):
        return str(base_date)
    if re.match(r'yesterday', timex, re.IGNORECASE):
        return str(base_date + timedelta(days=-1))
    if re.match(r'tomorrow', timex, re.IGNORECASE):
        return str(base_date + timedelta(days=+1))

    # 'last|this|next' followed by a weekday, a month name or a unit.
    relative = re.match(r'(last|this|next) ([a-z]+)', timex, re.IGNORECASE)
    if relative:
        shift = {'last': -1, 'this': 0, 'next': 1}[relative.group(1).lower()]
        # Matching is case-insensitive, so lower-case the word before
        # using it as a key of the lower-case lookup tables.
        word = relative.group(2).lower()
        if word in hashweekdays:
            # Weekday in the previous / current / following week: start
            # from the Monday of the base week, move whole weeks, then step
            # forward to the requested weekday.
            monday_of_base_week = base_date - timedelta(
                days=base_date.weekday())
            return str(monday_of_base_week
                       + timedelta(weeks=shift, days=hashweekdays[word]))
        if word == 'week':
            return _week_label(base_date + timedelta(weeks=shift))
        if word in hashmonths:
            # Named month in the previous / current / following year.
            return str(base_date.year + shift) + '-' + str(hashmonths[word])
        if word == 'month':
            return _month_label(base_date, shift)
        if word == 'year':
            return str(base_date.year + shift)

    return 'UNKNOWN'  # Default value


# Given a timex_tagged_text and a Date object set to base_date,
# returns timex_grounded_text
def ground(tagged_text, base_date):
    """Add a val="..." attribute to every <TIMEX2> tag in *tagged_text*.

    tagged_text -- text whose temporal expressions are wrapped in
                   <TIMEX2>...</TIMEX2>, as produced by tag().
    base_date   -- the date that relative expressions are resolved
                   against; it must support timedelta arithmetic and
                   provide weekday(), isocalendar(), .year and .month
                   (datetime.date and datetime.datetime both do).

    Returns a tuple (grounded_text, timex_list).  timex_list holds one
    {"text": ..., "value": ...} dict per tagged expression, in order of
    appearance; "value" is 'UNKNOWN' when the expression cannot be
    grounded.
    """
    # Find all identified timex and put them into a list
    timex_regex = re.compile(r'<TIMEX2>.*?</TIMEX2>', re.DOTALL)
    timex_found = [re.sub(r'</?TIMEX2.*?>', '', timex)
                   for timex in timex_regex.findall(tagged_text)]

    # Review note: this list is used to return the timex values as a list
    # in addition to the timex tagged format.
    timexList = []

    # Review note: the original version rebound the module-level `month`
    # pattern while grounding, leaving it unusable on subsequent calls.
    # Nothing here assigns to the module-level patterns any more; month
    # names are recognised through the hashmonths table instead.

    # Calculate the new date accordingly
    for timex_ori in timex_found:
        timex_val = _ground_timex(timex_ori, base_date)

        # Remove 'time' from timex_val.
        # For example, If timex_val = 2000-02-20 12:23:34.45, then
        # timex_val = 2000-02-20
        timex_val = re.sub(r'\s.*', '', timex_val)

        # Substitute tag+timex in the text with grounded tag+timex.
        # str.replace() treats the expression literally, so characters
        # such as '.' in an ISO stamp are not read as regex syntax.
        tagged_text = tagged_text.replace(
            '<TIMEX2>' + timex_ori + '</TIMEX2>',
            '<TIMEX2 val="' + timex_val + '">' + timex_ori + '</TIMEX2>')

        timexList.append({
            "text": timex_ori,
            "value": timex_val
        })

    return tagged_text, timexList

####

def demo():
    """Tag the start of NLTK's ABC 'rural.txt' corpus and print the result.

    Only the first 10000 characters of the raw corpus text are tagged.
    """
    import nltk

    raw_corpus = nltk.corpus.abc.raw('rural.txt')
    sample = raw_corpus[:10000]
    print(tag(sample))


# Run the demo only when the module is executed as a script.
if __name__ == '__main__':
    demo()